michaelthwan committed
Commit: 6f61bb9
1 parent: 544aeb4

import project
Files changed:
- .gitignore +131 -0
- README.md +66 -13
- config/config.yaml +9 -0
- digester/chatgpt_service.py +339 -0
- digester/gradio_method_service.py +392 -0
- digester/gradio_ui_service.py +269 -0
- digester/test_chatgpt.py +106 -0
- digester/test_youtube_chain.py +102 -0
- digester/util.py +86 -0
- img/final_full_summary.png +0 -0
- img/in_process.png +0 -0
- img/multi_language.png +0 -0
- img/n_things_example.png +0 -0
- main.py +28 -0
- requirements.txt +7 -0
.gitignore
CHANGED
@@ -1 +1,132 @@
```diff
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
 /.idea/*
+config_secret.yaml
+*report*.md
```
README.md
CHANGED
@@ -1,13 +1,66 @@
````markdown
# DigestEverythingGPT

DigestEverythingGPT provides a world-class content summarization/query tool that leverages ChatGPT/LLMs to help users
quickly understand essential information from various forms of content, such as podcasts, YouTube videos, and PDF
documents.

The prompt engineering is **chained and tuned** so that the result is high-quality and fast. It is not a simple
single-query-and-response tool.

# Showcases

**Example of a summary**

- "OpenAssistant RELEASED! The world's best open-source Chat AI!" (https://www.youtube.com/watch?v=ddG2fM9i4Kk)

![final_full_summary](/img/final_full_summary.png)

**DigestEverythingGPT's final output adapts to the video type.**

- For example, for the video "17 cheap purchases that save me
  time" (https://www.youtube.com/watch?v=f7Lfukf0IKY&t=3s&ab_channel=AliAbdaal)

- it shows the summary and the specific 17 items correctly.

![n_things_example](/img/n_things_example.png)

**LLM loading-in-progress screen: chained prompt engineering, batched inference, etc.**

![in_process](/img/in_process.png)

**Support for multiple languages**, regardless of the video language

![multi_language](/img/multi_language.png)

# Live website

[TODO]

# Features

- **Content Summarization**:
    - Automatically generate concise summaries of various types of content, allowing users to save time and make
      informed decisions about in-depth engagement.
    - Chained/batched/advanced prompt engineering for higher-quality, faster results.
- **Interactive "Ask" Feature** (in progress):
    - Users can pose questions to the tool and receive answers extracted from specific sections within the full content.
- **Cross-Medium Support**:
    - DigestEverythingGPT is designed to work with a wide range of content mediums.
    - Currently, the tool supports
        - YouTube videos [beta]
        - podcasts (in progress)
        - PDF documents (in progress)

# Installation

Use Python 3.10+ (tested on 3.10.8). Install dependencies from requirements.txt, then launch the Gradio UI with main.py:

```
pip install -r requirements.txt
python main.py
```

# License

DigestEverything-GPT is licensed under the MIT License.
````
config/config.yaml
ADDED
@@ -0,0 +1,9 @@
```yaml
gradio:
  concurrent: 20
  port: 7860
openai:
  api_url: "https://api.openai.com/v1/chat/completions"
  content_token: 3200  # token budget per content_main (e.g. transcript); if exceeded, the content is split and iterated
  timeout_sec: 25
  max_retry: 2
  api_key: ""
```
|
digester/chatgpt_service.py
ADDED
@@ -0,0 +1,339 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import logging
|
3 |
+
import re
|
4 |
+
import threading
|
5 |
+
import time
|
6 |
+
import traceback
|
7 |
+
|
8 |
+
import requests
|
9 |
+
|
10 |
+
from digester.util import get_config, Prompt, get_token, get_first_n_tokens_and_remaining, provide_text_with_css, GradioInputs
|
11 |
+
|
12 |
+
timeout_bot_msg = "Request timeout. Network error"
|
13 |
+
SYSTEM_PROMPT = "Be a assistant to digest youtube, podcast content to give summaries and insights"
|
14 |
+
|
15 |
+
TIMEOUT_MSG = f'{provide_text_with_css("ERROR", "red")} Request timeout.'
|
16 |
+
TOKEN_EXCEED_MSG = f'{provide_text_with_css("ERROR", "red")} Exceed token but it should not happen and should be splitted.'
|
17 |
+
|
18 |
+
# This piece of code heavily reference
|
19 |
+
# - https://github.com/GaiZhenbiao/ChuanhuChatGPT
|
20 |
+
# - https://github.com/binary-husky/chatgpt_academic
|
21 |
+
|
22 |
+
|
23 |
+
config = get_config()
|
24 |
+
|
25 |
+
|
26 |
+
class LLMService:
|
27 |
+
@staticmethod
|
28 |
+
def report_exception(chatbot, history, chat_input, chat_output):
|
29 |
+
chatbot.append((chat_input, chat_output))
|
30 |
+
history.append(chat_input)
|
31 |
+
history.append(chat_output)
|
32 |
+
|
33 |
+
@staticmethod
|
34 |
+
def get_full_error(chunk, stream_response):
|
35 |
+
while True:
|
36 |
+
try:
|
37 |
+
chunk += next(stream_response)
|
38 |
+
except:
|
39 |
+
break
|
40 |
+
return chunk
|
41 |
+
|
42 |
+
@staticmethod
|
43 |
+
def generate_payload(api_key, gpt_model, inputs, history, stream):
|
44 |
+
headers = {
|
45 |
+
"Content-Type": "application/json",
|
46 |
+
"Authorization": f"Bearer {api_key}"
|
47 |
+
}
|
48 |
+
|
49 |
+
conversation_cnt = len(history) // 2
|
50 |
+
|
51 |
+
messages = [{"role": "system", "content": SYSTEM_PROMPT}]
|
52 |
+
if conversation_cnt:
|
53 |
+
for index in range(0, 2 * conversation_cnt, 2):
|
54 |
+
what_i_have_asked = {}
|
55 |
+
what_i_have_asked["role"] = "user"
|
56 |
+
what_i_have_asked["content"] = history[index]
|
57 |
+
what_gpt_answer = {}
|
58 |
+
what_gpt_answer["role"] = "assistant"
|
59 |
+
what_gpt_answer["content"] = history[index + 1]
|
60 |
+
if what_i_have_asked["content"] != "":
|
61 |
+
if what_gpt_answer["content"] == "": continue
|
62 |
+
if what_gpt_answer["content"] == timeout_bot_msg: continue
|
63 |
+
messages.append(what_i_have_asked)
|
64 |
+
messages.append(what_gpt_answer)
|
65 |
+
else:
|
66 |
+
messages[-1]['content'] = what_gpt_answer['content']
|
67 |
+
|
68 |
+
what_i_ask_now = {}
|
69 |
+
what_i_ask_now["role"] = "user"
|
70 |
+
what_i_ask_now["content"] = inputs
|
71 |
+
messages.append(what_i_ask_now)
|
72 |
+
|
73 |
+
payload = {
|
74 |
+
"model": gpt_model,
|
75 |
+
"messages": messages,
|
76 |
+
"temperature": 1.0,
|
77 |
+
"top_p": 1.0,
|
78 |
+
"n": 1,
|
79 |
+
"stream": stream,
|
80 |
+
"presence_penalty": 0,
|
81 |
+
"frequency_penalty": 0,
|
82 |
+
}
|
83 |
+
|
84 |
+
print(f"generate_payload() LLM: {gpt_model}, conversation_cnt: {conversation_cnt}")
|
85 |
+
print(f"\n[[[[[INPUT]]]]]\n{inputs}")
|
86 |
+
print(f"[[[[[OUTPUT]]]]]")
|
87 |
+
return headers, payload
|
88 |
+
|
89 |
+
|
90 |
+
class ChatGPTService:
|
91 |
+
@staticmethod
|
92 |
+
def say(user_say, chatbot_say, chatbot, history, status, source_md, is_append=True):
|
93 |
+
if is_append:
|
94 |
+
chatbot.append((user_say, chatbot_say))
|
95 |
+
else:
|
96 |
+
chatbot[-1] = (user_say, chatbot_say)
|
97 |
+
yield chatbot, history, status, source_md
|
98 |
+
|
99 |
+
@staticmethod
|
100 |
+
def get_reduce_token_percent(text):
|
101 |
+
try:
|
102 |
+
pattern = r"(\d+)\s+tokens\b"
|
103 |
+
match = re.findall(pattern, text)
|
104 |
+
EXCEED_ALLO = 500
|
105 |
+
max_limit = float(match[0]) - EXCEED_ALLO
|
106 |
+
current_tokens = float(match[1])
|
107 |
+
ratio = max_limit / current_tokens
|
108 |
+
assert ratio > 0 and ratio < 1
|
109 |
+
return ratio, str(int(current_tokens - max_limit))
|
110 |
+
except:
|
111 |
+
return 0.5, 'Unknown'
|
112 |
+
|
113 |
+
@staticmethod
|
114 |
+
def trigger_callgpt_pipeline(prompt_obj: Prompt, prompt_show_user: str, g_inputs: GradioInputs, is_timestamp=False):
|
115 |
+
chatbot, history, source_md, api_key, gpt_model = g_inputs.chatbot, g_inputs.history, f"[{g_inputs.source_textbox}] {g_inputs.source_target_textbox}", g_inputs.apikey_textbox, g_inputs.gpt_model_textbox
|
116 |
+
yield from ChatGPTService.say(prompt_show_user, f"{provide_text_with_css('INFO', 'blue')} waiting for ChatGPT's response.", chatbot, history, "Success", source_md)
|
117 |
+
|
118 |
+
prompts = ChatGPTService.split_prompt_content(prompt_obj, is_timestamp)
|
119 |
+
full_gpt_response = ""
|
120 |
+
for i, prompt in enumerate(prompts):
|
121 |
+
yield from ChatGPTService.say(None, f"{provide_text_with_css('INFO', 'blue')} Processing Batch {i + 1} / {len(prompts)}",
|
122 |
+
chatbot, history, "Success", source_md)
|
123 |
+
prompt_str = f"{prompt.prompt_prefix}{prompt.prompt_main}{prompt.prompt_suffix}"
|
124 |
+
|
125 |
+
gpt_response = yield from ChatGPTService.single_call_chatgpt_with_handling(
|
126 |
+
source_md, prompt_str, prompt_show_user, chatbot, api_key, gpt_model, history=[]
|
127 |
+
)
|
128 |
+
|
129 |
+
chatbot[-1] = (prompt_show_user, gpt_response)
|
130 |
+
# seems no need chat history now (have it later?)
|
131 |
+
# history.append(prompt_show_user)
|
132 |
+
# history.append(gpt_response)
|
133 |
+
full_gpt_response += gpt_response
|
134 |
+
yield chatbot, history, "Success", source_md # show gpt output
|
135 |
+
return full_gpt_response, len(prompts)
|
136 |
+
|
137 |
+
@staticmethod
|
138 |
+
def split_prompt_content(prompt: Prompt, is_timestamp=False) -> list:
|
139 |
+
"""
|
140 |
+
Split the prompt.prompt_main into multiple parts, each part is less than <content_token=3500> tokens
|
141 |
+
Then return all prompts object
|
142 |
+
"""
|
143 |
+
prompts = []
|
144 |
+
MAX_CONTENT_TOKEN = config.get('openai').get('content_token')
|
145 |
+
if not is_timestamp:
|
146 |
+
temp_prompt_main = prompt.prompt_main
|
147 |
+
while True:
|
148 |
+
if len(temp_prompt_main) == 0:
|
149 |
+
break
|
150 |
+
elif len(temp_prompt_main) < MAX_CONTENT_TOKEN:
|
151 |
+
prompts.append(Prompt(prompt_prefix=prompt.prompt_prefix,
|
152 |
+
prompt_main=temp_prompt_main,
|
153 |
+
prompt_suffix=prompt.prompt_suffix))
|
154 |
+
break
|
155 |
+
else:
|
156 |
+
first, last = get_first_n_tokens_and_remaining(temp_prompt_main, MAX_CONTENT_TOKEN)
|
157 |
+
temp_prompt_main = last
|
158 |
+
prompts.append(Prompt(prompt_prefix=prompt.prompt_prefix,
|
159 |
+
prompt_main=first,
|
160 |
+
prompt_suffix=prompt.prompt_suffix))
|
161 |
+
else:
|
162 |
+
# A bit ugly to handle the timestamped version and non-timestamped version in this matter.
|
163 |
+
# But make a working software first.
|
164 |
+
paragraphs_split_by_timestamp = []
|
165 |
+
for sentence in prompt.prompt_main.split('\n'):
|
166 |
+
if sentence == "":
|
167 |
+
continue
|
168 |
+
|
169 |
+
def is_start_with_timestamp(sentence):
|
170 |
+
return sentence[0].isdigit() and (sentence[1] == ":" or sentence[2] == ":")
|
171 |
+
|
172 |
+
if is_start_with_timestamp(sentence):
|
173 |
+
paragraphs_split_by_timestamp.append(sentence)
|
174 |
+
else:
|
175 |
+
paragraphs_split_by_timestamp[-1] += sentence
|
176 |
+
|
177 |
+
def extract_timestamp(paragraph):
|
178 |
+
return paragraph.split(' ')[0]
|
179 |
+
|
180 |
+
def extract_minute(timestamp):
|
181 |
+
return int(timestamp.split(':')[0])
|
182 |
+
|
183 |
+
def append_prompt(prompt, prompts, temp_minute, temp_paragraph, temp_timestamp):
|
184 |
+
prompts.append(Prompt(prompt_prefix=prompt.prompt_prefix,
|
185 |
+
prompt_main=temp_paragraph,
|
186 |
+
prompt_suffix=prompt.prompt_suffix.format(first_timestamp=temp_timestamp,
|
187 |
+
second_minute=temp_minute + 2,
|
188 |
+
third_minute=temp_minute + 4)
|
189 |
+
# this formatting gives better result in one-shot learning / example.
|
190 |
+
# ie if it is the second+ splitted prompt, don't use 0:00 as the first timestamp example
|
191 |
+
# use the exact first timestamp of the splitted prompt
|
192 |
+
))
|
193 |
+
|
194 |
+
token_num_list = list(map(get_token, paragraphs_split_by_timestamp)) # e.g. [159, 160, 158, ..]
|
195 |
+
timestamp_list = list(map(extract_timestamp, paragraphs_split_by_timestamp)) # e.g. ['0:00', '0:32', '1:03' ..]
|
196 |
+
minute_list = list(map(extract_minute, timestamp_list)) # e.g. [0, 0, 1, ..]
|
197 |
+
|
198 |
+
accumulated_token_num, temp_paragraph, temp_timestamp, temp_minute = 0, "", timestamp_list[0], minute_list[0]
|
199 |
+
for i, paragraph in enumerate(paragraphs_split_by_timestamp):
|
200 |
+
curr_token_num = token_num_list[i]
|
201 |
+
if accumulated_token_num + curr_token_num > MAX_CONTENT_TOKEN:
|
202 |
+
append_prompt(prompt, prompts, temp_minute, temp_paragraph, temp_timestamp)
|
203 |
+
accumulated_token_num, temp_paragraph = 0, ""
|
204 |
+
try:
|
205 |
+
temp_timestamp, temp_minute = timestamp_list[i + 1], minute_list[i + 1]
|
206 |
+
except IndexError:
|
207 |
+
temp_timestamp, temp_minute = timestamp_list[i], minute_list[i] # should be trivial. No more next part
|
208 |
+
else:
|
209 |
+
temp_paragraph += paragraph + "\n"
|
210 |
+
accumulated_token_num += curr_token_num
|
211 |
+
if accumulated_token_num > 0: # add back remaining
|
212 |
+
append_prompt(prompt, prompts, temp_minute, temp_paragraph, temp_timestamp)
|
213 |
+
return prompts
|
214 |
+
|
215 |
+
@staticmethod
|
216 |
+
def single_call_chatgpt_with_handling(source_md, prompt_str: str, prompt_show_user: str, chatbot, api_key, gpt_model="gpt-3.5-turbo", history=[]):
|
217 |
+
"""
|
218 |
+
Handling
|
219 |
+
- token exceeding -> split input
|
220 |
+
- timeout -> retry 2 times
|
221 |
+
- other error -> retry 2 times
|
222 |
+
"""
|
223 |
+
|
224 |
+
TIMEOUT_SECONDS, MAX_RETRY = config['openai']['timeout_sec'], config['openai']['max_retry']
|
225 |
+
# When multi-threaded, you need a mutable structure to pass information between different threads
|
226 |
+
# list is the simplest mutable structure, we put gpt output in the first position, the second position to pass the error message
|
227 |
+
mutable_list = [None, ''] # [gpt_output, error_message]
|
228 |
+
|
229 |
+
# multi-threading worker
|
230 |
+
def mt(prompt_str, history):
|
231 |
+
while True:
|
232 |
+
try:
|
233 |
+
mutable_list[0] = ChatGPTService.single_rest_call_chatgpt(api_key, prompt_str, gpt_model, history=history)
|
234 |
+
break
|
235 |
+
except ConnectionAbortedError as token_exceeded_error:
|
236 |
+
# # Try to calculate the ratio and keep as much text as possible
|
237 |
+
# print(f'[Local Message] Token exceeded: {token_exceeded_error}.')
|
238 |
+
# p_ratio, n_exceed = ChatGPTService.get_reduce_token_percent(str(token_exceeded_error))
|
239 |
+
# if len(history) > 0:
|
240 |
+
# history = [his[int(len(his) * p_ratio):] for his in history if his is not None]
|
241 |
+
# else:
|
242 |
+
# prompt_str = prompt_str[:int(len(prompt_str) * p_ratio)]
|
243 |
+
# mutable_list[1] = f'Warning: text too long will be truncated. Token exceeded:{n_exceed},Truncation ratio: {(1 - p_ratio):.0%}。'
|
244 |
+
mutable_list[0] = TOKEN_EXCEED_MSG
|
245 |
+
except TimeoutError as e:
|
246 |
+
mutable_list[0] = TIMEOUT_MSG
|
247 |
+
raise TimeoutError
|
248 |
+
except Exception as e:
|
249 |
+
mutable_list[0] = f'{provide_text_with_css("ERROR", "red")} Exception: {str(e)}.'
|
250 |
+
raise RuntimeError(f'[ERROR] Exception: {str(e)}.')
|
251 |
+
# TODO retry
|
252 |
+
|
253 |
+
# Create a new thread to make http requests
|
254 |
+
thread_name = threading.Thread(target=mt, args=(prompt_str, history))
|
255 |
+
thread_name.start()
|
256 |
+
# The original thread is responsible for continuously updating the UI, implementing a timeout countdown, and waiting for the new thread's task to complete
|
257 |
+
cnt = 0
|
258 |
+
while thread_name.is_alive():
|
259 |
+
cnt += 1
|
260 |
+
is_append = False
|
261 |
+
if cnt == 1:
|
262 |
+
is_append = True
|
263 |
+
yield from ChatGPTService.say(prompt_show_user, f"""
|
264 |
+
{provide_text_with_css("PROCESSING...", "blue")} {mutable_list[1]}waiting gpt response {cnt}/{TIMEOUT_SECONDS * 2 * (MAX_RETRY + 1)}{''.join(['.'] * (cnt % 4))}
|
265 |
+
{mutable_list[0]}
|
266 |
+
""", chatbot, history, 'Normal', source_md, is_append)
|
267 |
+
time.sleep(1)
|
268 |
+
# Get the output of gpt out of the mutable
|
269 |
+
gpt_response = mutable_list[0]
|
270 |
+
if 'ERROR' in gpt_response:
|
271 |
+
raise Exception
|
272 |
+
return gpt_response
|
273 |
+
|
274 |
+
@staticmethod
|
275 |
+
def single_rest_call_chatgpt(api_key, prompt_str: str, gpt_model="gpt-3.5-turbo", history=[], observe_window=None):
|
276 |
+
"""
|
277 |
+
Single call chatgpt only. No handling on multiple call (it should be in upper caller multi_call_chatgpt_with_handling())
|
278 |
+
- Support stream=True
|
279 |
+
- observe_window: used to pass the output across threads, most of the time just for the fancy visual effect, just leave it empty
|
280 |
+
- retry 2 times
|
281 |
+
"""
|
282 |
+
headers, payload = LLMService.generate_payload(api_key, gpt_model, prompt_str, history, stream=True)
|
283 |
+
|
284 |
+
retry = 0
|
285 |
+
while True:
|
286 |
+
try:
|
287 |
+
# make a POST request to the API endpoint, stream=False
|
288 |
+
response = requests.post(config['openai']['api_url'], headers=headers,
|
289 |
+
json=payload, stream=True, timeout=config['openai']['timeout_sec']
|
290 |
+
)
|
291 |
+
break
|
292 |
+
except requests.exceptions.ReadTimeout as e:
|
293 |
+
max_retry = config['openai']['max_retry']
|
294 |
+
retry += 1
|
295 |
+
traceback.print_exc()
|
296 |
+
if retry > max_retry:
|
297 |
+
raise TimeoutError
|
298 |
+
if max_retry != 0:
|
299 |
+
print(f'Request timeout. Retrying ({retry}/{max_retry}) ...')
|
300 |
+
|
301 |
+
stream_response = response.iter_lines()
|
302 |
+
result = ''
|
303 |
+
while True:
|
304 |
+
try:
|
305 |
+
chunk = next(stream_response).decode()
|
306 |
+
except StopIteration:
|
307 |
+
break
|
308 |
+
if len(chunk) == 0: continue
|
309 |
+
if not chunk.startswith('data:'):
|
310 |
+
error_msg = LLMService.get_full_error(chunk.encode('utf8'), stream_response).decode()
|
311 |
+
if "reduce the length" in error_msg:
|
312 |
+
raise ConnectionAbortedError("OpenAI rejected the request:" + error_msg)
|
313 |
+
else:
|
314 |
+
raise RuntimeError("OpenAI rejected the request: " + error_msg)
|
315 |
+
json_data = json.loads(chunk.lstrip('data:'))['choices'][0]
|
316 |
+
delta = json_data["delta"]
|
317 |
+
if len(delta) == 0: break
|
318 |
+
if "role" in delta: continue
|
319 |
+
if "content" in delta:
|
320 |
+
result += delta["content"]
|
321 |
+
print(delta["content"], end='')
|
322 |
+
if observe_window is not None: observe_window[0] += delta["content"]
|
323 |
+
else:
|
324 |
+
raise RuntimeError("Unexpected Json structure: " + delta)
|
325 |
+
if json_data['finish_reason'] == 'length':
|
326 |
+
raise ConnectionAbortedError("Completed normally with insufficient Tokens")
|
327 |
+
return result
|
328 |
+
|
329 |
+
|
330 |
+
if __name__ == '__main__':
|
331 |
+
import pickle
|
332 |
+
|
333 |
+
prompt: Prompt = pickle.load(open('prompt.pkl', 'rb'))
|
334 |
+
prompts = ChatGPTService.split_prompt_content(prompt, is_timestamp=True)
|
335 |
+
for prompt in prompts:
|
336 |
+
print("=====================================")
|
337 |
+
print(prompt.prompt_prefix)
|
338 |
+
print(prompt.prompt_main)
|
339 |
+
print(prompt.prompt_suffix)
|
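A hedged usage sketch (not part of the commit): `split_prompt_content` turns one oversized `Prompt` into several batches, each re-wrapped with the original prefix and suffix so every batch is a self-contained query. The transcript below is hypothetical.

```python
# Hypothetical sketch: batching an oversized, non-timestamped prompt.
from digester.util import Prompt
from digester.chatgpt_service import ChatGPTService

prompt = Prompt(prompt_prefix="[TRANSCRIPT]\n",
                prompt_main="a very long transcript ... " * 1000,  # well over content_token
                prompt_suffix="\n[TASK]\nSummarize the above transcript.")
batches = ChatGPTService.split_prompt_content(prompt, is_timestamp=False)
# Each batch keeps the prefix/suffix; trigger_callgpt_pipeline() then calls the
# API once per batch and concatenates the responses.
print(len(batches))  # > 1
```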
digester/gradio_method_service.py
ADDED
@@ -0,0 +1,392 @@
````python
import json

from everything2text4prompt.everything2text4prompt import Everything2Text4Prompt
from everything2text4prompt.util import BaseData, YoutubeData, PodcastData

from digester.chatgpt_service import LLMService, ChatGPTService
from digester.util import Prompt, provide_text_with_css, GradioInputs

WAITING_FOR_TARGET_INPUT = "Waiting for target source input"
RESPONSE_SUFFIX = "⚡Powered by DigestEverythingGPT in github"


class GradioMethodService:
    """
    GradioMethodService methods are defined as gradio functions.
    Therefore all methods here fulfill
    - gradio.inputs as the signature
    - gradio.outputs as the return
    Detailed-level methods called by methods in GradioMethodService live in other classes (e.g. DigesterService)
    """

    @staticmethod
    def write_results_to_file(history, file_name=None):
        """
        Writes the conversation history to a file in Markdown format.
        If no filename is specified, the filename is generated using the current time.
        """
        import os, time
        if file_name is None:
            file_name = 'chatGPT_report' + time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()) + '.md'
        os.makedirs('./analyzer_logs/', exist_ok=True)
        with open(f'./analyzer_logs/{file_name}', 'w', encoding='utf8') as f:
            f.write('# chatGPT report\n')
            for i, content in enumerate(history):
                try:
                    if type(content) != str: content = str(content)
                except:
                    continue
                if i % 2 == 0:
                    f.write('## ')
                f.write(content)
                f.write('\n\n')
        res = 'The above material has been written in ' + os.path.abspath(f'./analyzer_logs/{file_name}')
        print(res)
        return res

    @staticmethod
    def fetch_and_summarize(apikey_textbox, source_textbox, source_target_textbox, qa_textbox, gpt_model_textbox, language_textbox, chatbot, history):
        g_inputs = GradioInputs(apikey_textbox, source_textbox, source_target_textbox, qa_textbox, gpt_model_textbox, language_textbox, chatbot, history)
        g_inputs.history = []
        g_inputs.chatbot = []

        if g_inputs.apikey_textbox == "" or g_inputs.source_textbox == "" or g_inputs.source_target_textbox == "":
            LLMService.report_exception(g_inputs.chatbot, g_inputs.history,
                                        chat_input=f"Source target: [{g_inputs.source_textbox}] {g_inputs.source_target_textbox}",
                                        chat_output=f"{provide_text_with_css('ERROR', 'red')} Please provide the api key, source and target source")
            yield g_inputs.chatbot, g_inputs.history, 'Error', WAITING_FOR_TARGET_INPUT
            return
        # TODO: invalid input checking
        is_success, text_data = yield from DigesterService.fetch_text(g_inputs)
        if not is_success:
            return  # TODO: error handling testing
        yield from PromptEngineeringStrategy.execute_prompt_chain(g_inputs, text_data)

    @staticmethod
    def ask_question(apikey_textbox, source_textbox, target_source_textbox, qa_textbox, gpt_model_textbox, language_textbox, chatbot, history):
        g_inputs = GradioInputs(apikey_textbox, source_textbox, target_source_textbox, qa_textbox, gpt_model_textbox, language_textbox, chatbot, history)
        msg = f"ask_question(`{qa_textbox}`)"
        g_inputs.chatbot.append(("test prompt query", msg))
        yield g_inputs.chatbot, g_inputs.history, 'Normal'

    @staticmethod
    def test_formatting(apikey_textbox, source_textbox, target_source_textbox, qa_textbox, gpt_model_textbox, language_textbox, chatbot, history):
        g_inputs = GradioInputs(apikey_textbox, source_textbox, target_source_textbox, qa_textbox, gpt_model_textbox, language_textbox, chatbot, history)
        msg = r"""
# ASCII, table, code test
Overall, this program consists of the following files:
- `main.py`: This is the primary script of the program which uses NLP to analyze and summarize Python code.
- `model.py`: This file defines the `CodeModel` class that is used by `main.py` to model the code as graphs and performs operations on them.
- `parser.py`: This file contains custom parsing functions used by `model.py`.
- `test/`: This directory contains test scripts for `model.py` and `util.py`
- `util.py`: This file provides utility functions for the program such as getting the root directory of the project and reading configuration files.

`util.py` specifically has two functions:

| Function | Input | Output | Functionality |
|----------|-------|--------|---------------|
| `get_project_root()` | None | String containing the path of the parent directory of the script itself | Finds the path of the parent directory of the script itself |
| `get_config()` | None | Dictionary containing the contents of `config.yaml` and `config_secret.yaml`, merged together (with `config_secret.yaml` overwriting any keys with the same name in `config.yaml`) | Reads and merges two YAML configuration files (`config.yaml` and `config_secret.yaml`) located in the `config` directory in the parent directory of the script. Returns the resulting dictionary. |The above material has been written in C:\github\!CodeAnalyzerGPT\CodeAnalyzerGPT\analyzer_logs\chatGPT_report2023-04-07-14-11-55.md

The Hessian matrix is a square matrix that contains information about the second-order partial derivatives of a function. Suppose we have a function $f(x_1,x_2,...,x_n)$ which is twice continuously differentiable. Then the Hessian matrix $H(f)$ of $f$ is defined as the $n\times n$ matrix:

$$H(f) = \begin{bmatrix} \frac{\partial^2 f}{\partial x_1^2} & \frac{\partial^2 f}{\partial x_1 \partial x_2} & \cdots & \frac{\partial^2 f}{\partial x_1 \partial x_n} \\ \frac{\partial^2 f}{\partial x_2 \partial x_1} & \frac{\partial^2 f}{\partial x_2^2} & \cdots & \frac{\partial^2 f}{\partial x_2 \partial x_n} \\ \vdots & \vdots & \ddots & \vdots \\ \frac{\partial^2 f}{\partial x_n \partial x_1} & \frac{\partial^2 f}{\partial x_n \partial x_2} & \cdots & \frac{\partial^2 f}{\partial x_n^2} \end{bmatrix}$$

Each element in the Hessian matrix is the second-order partial derivative of the function with respect to a pair of variables, as shown in the matrix above

Here's an example Python code using SymPy module to get the derivative of a mathematical function:

```
import sympy as sp

x = sp.Symbol('x')
f = input('Enter a mathematical function in terms of x: ')
expr = sp.sympify(f)

dfdx = sp.diff(expr, x)
print('The derivative of', f, 'is:', dfdx)
```

This code will prompt the user to enter a mathematical function in terms of x and then use the `diff()` function from SymPy to calculate its derivative with respect to x. The result will be printed on the screen.



# Non-ASCII test

程序整体功能:CodeAnalyzerGPT工程是一个用于自动化代码分析和评审的工具。它使用了OpenAI的GPT模型对代码进行分析,然后根据一定的规则和标准来评价代码的质量和合规性。

程序的构架包含以下几个模块:

1. CodeAnalyzerGPT: 主程序模块,包含了代码分析和评审的主要逻辑。

2. analyzer: 包含了代码分析程序的具体实现。

每个文件的功能可以总结为下表:

| 文件名 | 功能描述 |
| --- | --- |
| C:\github\!CodeAnalyzerGPT\CodeAnalyzerGPT\CodeAnalyzerGPT.py | 主程序入口,调用各种处理逻辑和输出结果 |
| C:\github\!CodeAnalyzerGPT\CodeAnalyzerGPT\analyzer\code_analyzer.py | 代码分析器,包含了对代码文本的解析和分析逻辑 |
| C:\github\!CodeAnalyzerGPT\CodeAnalyzerGPT\analyzer\code_segment.py | 对代码文本进行语句和表达式的分段处理 |

"""
        g_inputs.chatbot.append(("test prompt query", msg))
        yield g_inputs.chatbot, g_inputs.history, 'Normal'

    @staticmethod
    def test_asking(apikey_textbox, source_textbox, target_source_textbox, qa_textbox, gpt_model_textbox, language_textbox, chatbot, history):
        g_inputs = GradioInputs(apikey_textbox, source_textbox, target_source_textbox, qa_textbox, gpt_model_textbox, language_textbox, chatbot, history)
        msg = f"test_ask(`{qa_textbox}`)"
        g_inputs.chatbot.append(("test prompt query", msg))
        g_inputs.chatbot.append(("test prompt query 2", msg))
        g_inputs.chatbot.append(("", "test empty message"))
        g_inputs.chatbot.append(("test empty message 2", ""))
        g_inputs.chatbot.append((None, "output msg, test no input msg"))
        g_inputs.chatbot.append(("input msg, test no output msg", None))
        g_inputs.chatbot.append((None, '<span style="background-color: yellow; color: black; padding: 3px; border-radius: 8px;">WARN</span>'))
        yield g_inputs.chatbot, g_inputs.history, 'Normal'


class DigesterService:
    @staticmethod
    def update_ui(chatbot_input, chatbot_output, status, target_md, chatbot, history, is_append=True):
        """
        For instant chatbot_input+output.
        Not suitable if chatbot_output has delay / processing time
        """
        if is_append:
            chatbot.append((chatbot_input, chatbot_output))
        else:
            chatbot[-1] = (chatbot_input, chatbot_output)
        history.append(chatbot_input)
        history.append(chatbot_output)
        yield chatbot, history, status, target_md

    @staticmethod
    def fetch_text(g_inputs: GradioInputs) -> (bool, BaseData):
        """Fetch text from the source using everything2text4prompt. No OpenAI call here"""
        converter = Everything2Text4Prompt(openai_api_key=g_inputs.apikey_textbox)
        text_data, is_success, error_msg = converter.convert_text(g_inputs.source_textbox, g_inputs.source_target_textbox)
        text_content = text_data.full_content

        chatbot_input = f"Converting source to text for [{g_inputs.source_textbox}] {g_inputs.source_target_textbox} ..."
        target_md = f"[{g_inputs.source_textbox}] {g_inputs.source_target_textbox}"
        if is_success:
            chatbot_output = f"""
Extracted text successfully:

{text_content}
"""
            yield from DigesterService.update_ui(chatbot_input, chatbot_output, "Success", target_md, g_inputs.chatbot, g_inputs.history)
        else:
            chatbot_output = f"""
{provide_text_with_css("ERROR", "red")} Text extraction failed ({error_msg})
"""
            yield from DigesterService.update_ui(chatbot_input, chatbot_output, "Error", target_md, g_inputs.chatbot, g_inputs.history)
        return is_success, text_data


class PromptEngineeringStrategy:
    @staticmethod
    def execute_prompt_chain(g_inputs: GradioInputs, text_data: BaseData):
        if g_inputs.source_textbox == 'youtube':
            yield from PromptEngineeringStrategy.execute_prompt_chain_youtube(g_inputs, text_data)
        elif g_inputs.source_textbox == 'podcast':
            yield from PromptEngineeringStrategy.execute_prompt_chain_podcast(g_inputs, text_data)

    @staticmethod
    def execute_prompt_chain_youtube(g_inputs: GradioInputs, text_data: YoutubeData):
        yield from YoutubeChain.execute_chain(g_inputs, text_data)

    @staticmethod
    def execute_prompt_chain_podcast(g_inputs: GradioInputs, text_data: PodcastData):
        pass


class Chain:
    @staticmethod
    def execute_chain(g_inputs: GradioInputs, text_data: YoutubeData):
        raise NotImplementedError


class YoutubeChain(Chain):
    CLASSIFIER_PROMPT = Prompt(
        prompt_prefix="""
[Youtube Video types]
N things: The youtube video shows N items that will be described in the video. For example "17 cheap purchases that save me time", "10 AMAZING Ways AutoGPT Is Being Used RIGHT NOW". Usually the title starts with a number.
Tutorials: how to do or make something in order to teach a skill or how to use a product or software
How-to and DIY: People show how to make or do something yourself, like crafts, recipes, projects, etc
Interview: Interviewee shows their standpoint on a topic.
Others: If the video type is not listed above

[TITLE]
{title}

[TRANSCRIPT]
""",
        prompt_main="""
{transcript}
""",
        prompt_suffix="""
[TASK]
From the above title and transcript, classify the youtube video into one of the types listed above.
Give the video type in JSON format like {"type": "N things"}, and exclude other text.
""")
    TIMESTAMPED_SUMMARY_PROMPT = Prompt(
        prompt_prefix="""
[TITLE]
{title}

[Transcript with timestamp]
""",
        prompt_main="""
{transcript_with_ts}
""",
        prompt_suffix="""
[TASK]
Convert this into a youtube summary.
Separate into 2-5 minute chunks, maximum 20 words per line.
Start with the timestamp followed by the summarized text for that chunk.
Must use language: {language}

Example format:
{first_timestamp} - This is the first part
{second_minute}:44 - This is the second part
{third_minute}:02 - This is the third part
""")

    FINAL_SUMMARY_PROMPT = Prompt(
        prompt_prefix="""
[VIDEO_TYPE]
This is the video type
N things: The youtube video shows N items that will be described in the video. For example "17 cheap purchases that save me time", "10 AMAZING Ways AutoGPT Is Being Used RIGHT NOW"
Tutorials: how to do or make something in order to teach a skill or how to use a product or software

[TITLE]
{title}

[TRANSCRIPT]
""",
        prompt_main="""
{transcript}
""",
        prompt_suffix="""
[TASK]
Summarize the above transcript, showing the main concepts point by point, step by step.
Use markdown format.
Must use language: {language}
{task_constraint}

The format is like:
Summary: (content of summary)
{format_constraint}
""")

    FINAL_SUMMARY_TASK_CONSTRAINTS = {
        "N things": """
Additionally, since it is an N things video, the summary should include the N items stated in the video.
""",
        "Tutorials": """
Additionally, since it is a Tutorial video, provide step by step instructions for the tutorial.
""",
    }
    FINAL_SUMMARY_FORMAT_CONSTRAINTS = {
        "N things": """
Items mentioned in the video: (content of N things)
""",
        "Tutorials": """
Instructions: (step by step instructions)
""",
    }

    @staticmethod
    def execute_chain(g_inputs: GradioInputs, text_data: YoutubeData):
        text_content = text_data.full_content
        timestamped_summary = yield from YoutubeChain.execute_timestamped_summary_chain(g_inputs, text_data)
        video_type = yield from YoutubeChain.execute_classifer_chain(g_inputs, text_data)
        final_summary = yield from YoutubeChain.execute_final_summary_chain(g_inputs, text_data, video_type)
        full_summary = f"""
{provide_text_with_css("DONE", "green")}
Video: {text_data.title}
# Timestamped summary
{timestamped_summary}

# Summary
{final_summary}

{RESPONSE_SUFFIX}
"""
        prompt_show_user = "Full summary"
        g_inputs.chatbot[-1] = (prompt_show_user, full_summary)
        g_inputs.history.append(prompt_show_user)
        g_inputs.history.append(full_summary)
        yield g_inputs.chatbot, g_inputs.history, "Success", f"[{g_inputs.source_textbox}] {g_inputs.source_target_textbox}"

    @classmethod
    def execute_classifer_chain(cls, g_inputs: GradioInputs, youtube_data: YoutubeData):
        TRANSCRIPT_CHAR_LIMIT = 200  # Because the classifier doesn't need to see the whole transcript
        prompt = Prompt(cls.CLASSIFIER_PROMPT.prompt_prefix.format(title=youtube_data.title),
                        cls.CLASSIFIER_PROMPT.prompt_main.format(transcript=youtube_data.full_content[:TRANSCRIPT_CHAR_LIMIT]),
                        cls.CLASSIFIER_PROMPT.prompt_suffix
                        )
        prompt_show_user = "Classify the video type for me"
        response, len_prompts = yield from ChatGPTService.trigger_callgpt_pipeline(prompt, prompt_show_user, g_inputs)
        try:
            video_type = json.loads(response)['type']
        except Exception as e:
            # TODO: Exception handling, show error in UI
            video_type = 'Others'
        return video_type

    @classmethod
    def execute_timestamped_summary_chain(cls, g_inputs: GradioInputs, youtube_data: YoutubeData):
        transcript_with_ts = ""
        for entry in youtube_data.ts_transcript_list:
            transcript_with_ts += f"{int(entry['start'] // 60)}:{int(entry['start'] % 60):02d} {entry['text']}\n"
        prompt = Prompt(cls.TIMESTAMPED_SUMMARY_PROMPT.prompt_prefix.format(title=youtube_data.title),
                        cls.TIMESTAMPED_SUMMARY_PROMPT.prompt_main.format(transcript_with_ts=transcript_with_ts),
                        cls.TIMESTAMPED_SUMMARY_PROMPT.prompt_suffix.replace("{language}", g_inputs.language_textbox)
                        )
        prompt_show_user = "Generate the timestamped summary"
        response, len_prompts = yield from ChatGPTService.trigger_callgpt_pipeline(prompt, prompt_show_user, g_inputs, is_timestamp=True)
        return response

    @classmethod
    def execute_final_summary_chain(cls, g_inputs: GradioInputs, youtube_data: YoutubeData, video_type):
        if video_type in cls.FINAL_SUMMARY_TASK_CONSTRAINTS.keys():
            task_constraint = cls.FINAL_SUMMARY_TASK_CONSTRAINTS[video_type]
            format_constraint = cls.FINAL_SUMMARY_FORMAT_CONSTRAINTS[video_type]
        else:
            task_constraint, format_constraint = "", ""
        prompt = Prompt(
            cls.FINAL_SUMMARY_PROMPT.prompt_prefix.format(title=youtube_data.title),
            cls.FINAL_SUMMARY_PROMPT.prompt_main.format(transcript=youtube_data.full_content),
            cls.FINAL_SUMMARY_PROMPT.prompt_suffix.format(task_constraint=task_constraint, format_constraint=format_constraint, language=g_inputs.language_textbox)
        )
        prompt_show_user = "Generate the final summary"
        response, len_prompts = yield from ChatGPTService.trigger_callgpt_pipeline(prompt, prompt_show_user, g_inputs)
        if len_prompts > 1:
            # Give a summary of the summaries if the video is long
            prompt = Prompt(
                cls.FINAL_SUMMARY_PROMPT.prompt_prefix.format(title=youtube_data.title),
                cls.FINAL_SUMMARY_PROMPT.prompt_main.format(transcript=response),
                cls.FINAL_SUMMARY_PROMPT.prompt_suffix.format(task_constraint=task_constraint, format_constraint=format_constraint, language=g_inputs.language_textbox)
            )
            prompt_show_user = "Since the video is long, generating the final summary of the summaries"
            response, len_prompts = yield from ChatGPTService.trigger_callgpt_pipeline(prompt, prompt_show_user, g_inputs)
        return response


if __name__ == '__main__':
    GPT_MODEL = "gpt-3.5-turbo"
    API_KEY = ""
    input_1 = """Give me 2 ideas for the summer"""
    # input_1 = """Explain more on the first idea"""
    response_1 = ChatGPTService.single_rest_call_chatgpt(API_KEY, input_1, GPT_MODEL)
    print(response_1)

    input_2 = """
For the first idea, suggest some step by step planning for me
"""
    response_2 = ChatGPTService.single_rest_call_chatgpt(API_KEY, input_2, GPT_MODEL, history=[input_1, response_1])
    print(response_2)
````
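Because every handler above is a generator that yields `(chatbot, history, status, target_md)` tuples for Gradio streaming, the whole pipeline can also be driven outside the UI. A sketch, assuming a valid OpenAI key; the video id is the one from the README showcase:

```python
# Sketch (assumes a valid API key; not part of the commit).
from digester.gradio_method_service import GradioMethodService

for chatbot, history, status, target_md in GradioMethodService.fetch_and_summarize(
        "sk-...", "youtube", "ddG2fM9i4Kk", "", "gpt-3.5-turbo", "en-US", [], []):
    print(status)  # streams "Success" / "Normal" / "Error" as the chain progresses
print(chatbot[-1][1])  # the last chatbot message holds the full summary
```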
digester/gradio_ui_service.py
ADDED
@@ -0,0 +1,269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import markdown
|
3 |
+
|
4 |
+
from digester.gradio_method_service import GradioMethodService
|
5 |
+
|
6 |
+
title_html = "<h1 align=\"center\">DigestEverythingGPT</h1>"
|
7 |
+
|
8 |
+
cancel_handles = []
|
9 |
+
|
10 |
+
|
11 |
+
class GradioUIService:
|
12 |
+
@staticmethod
|
13 |
+
def get_functions():
|
14 |
+
functions = {
|
15 |
+
"Fetch and summarize!": {
|
16 |
+
"function": GradioMethodService.fetch_and_summarize,
|
17 |
+
},
|
18 |
+
"Ask": {
|
19 |
+
"function": GradioMethodService.ask_question
|
20 |
+
},
|
21 |
+
"Test formatting": {
|
22 |
+
"function": GradioMethodService.test_formatting
|
23 |
+
},
|
24 |
+
"Test asking": {
|
25 |
+
"function": GradioMethodService.test_asking
|
26 |
+
},
|
27 |
+
}
|
28 |
+
return functions
|
29 |
+
|
30 |
+
@staticmethod
|
31 |
+
def post_define_functions(functions, folder_md):
|
32 |
+
"""Append extra gradio objects to functions after creating gradio objects"""
|
33 |
+
functions["Fetch and summarize!"]["extra_outputs"] = [folder_md]
|
34 |
+
return functions
|
35 |
+
|
36 |
+
@staticmethod
|
37 |
+
def get_gradio_ui():
|
38 |
+
def get_extra_outputs(functions, fn_key):
|
39 |
+
if functions[fn_key].get('extra_outputs'):
|
40 |
+
return functions[fn_key]['extra_outputs']
|
41 |
+
return []
|
42 |
+
|
43 |
+
# gr.Chatbot.postprocess = GradioUIService.format_io
|
44 |
+
functions = GradioUIService.get_functions()
|
45 |
+
with gr.Blocks(theme=GradioUIService.get_theme(), css=GradioUIService.get_css()) as demo:
|
46 |
+
gr.HTML(title_html)
|
47 |
+
with gr.Row().style(equal_height=True):
|
48 |
+
with gr.Column(scale=1):
|
49 |
+
with gr.Row():
|
50 |
+
apikey_textbox = gr.Textbox(label="OpenAI API key", placeholder="e.g. sk-xxxxx", css_class="api-key")
|
51 |
+
with gr.Row():
|
52 |
+
source_textbox = gr.Dropdown(
|
53 |
+
["youtube", "podcast (not support now)", "pdf (not support now)"],
|
54 |
+
value="youtube", label="Source", info="Choose your content provider"
|
55 |
+
# TODO: dynamic list from everything2text4prompt
|
56 |
+
)
|
57 |
+
with gr.Row():
|
58 |
+
source_target_textbox = gr.Textbox(show_label=True, label="URL / source target",
|
59 |
+
placeholder="For youtube video, give video id\nFor podcast, give podcast URL")
|
60 |
+
with gr.Accordion("Options", open=True):
|
61 |
+
with gr.Row():
|
62 |
+
gpt_model_textbox = gr.Dropdown(
|
63 |
+
["gpt-3.5-turbo", "gpt-4"],
|
64 |
+
value="gpt-3.5-turbo", label="GPT model", info="gpt-3.5 is cheaper.\nBut if you found that the result is not good, try gpt-4 \nYour API key must support gpt-4"
|
65 |
+
)
|
66 |
+
with gr.Row():
|
67 |
+
language_textbox = gr.Dropdown(
|
68 |
+
["en-US", "zh-CN", "zh-TW", "it-IT", "fr-FR", "de-DE", "es-ES", "ja-JP", "ko-KR", "ru-RU", ],
|
69 |
+
value="en-US", label="Language", info="Choose your language, regardless of video language"
|
70 |
+
)
|
71 |
+
with gr.Row():
|
72 |
+
functions["Fetch and summarize!"]["btn"] = gr.Button("Fetch and summarize!", variant="primary")
|
73 |
+
with gr.Row().style(equal_height=True):
|
74 |
+
gr.Markdown(f"Status: ")
|
75 |
+
status_md = gr.Markdown(f"Normal")
|
76 |
+
with gr.Row():
|
77 |
+
folder_md = gr.Markdown(f"Waiting for source target input")
|
78 |
+
with gr.Row():
|
79 |
+
qa_textbox = gr.Textbox(show_label=False, placeholder="Ask questions").style(container=False)
|
80 |
+
with gr.Row():
|
81 |
+
functions["Ask"]["btn"] = gr.Button("Ask", variant="primary")
|
82 |
+
with gr.Row():
|
83 |
+
reset_btn = gr.Button("Reset", variant="secondary")
|
84 |
+
reset_btn.style(size="sm")
|
85 |
+
stop_btn = gr.Button("Stop", variant="secondary")
|
86 |
+
stop_btn.style(size="sm")
|
87 |
+
with gr.Accordion("debug", open=True):
|
88 |
+
with gr.Row():
|
89 |
+
functions["Test formatting"]["btn"] = gr.Button("Test formatting")
|
90 |
+
functions["Test asking"]["btn"] = gr.Button("Test asking")
|
91 |
+
|
92 |
+
with gr.Column(scale=3):
|
93 |
+
chatbot = gr.Chatbot()
|
94 |
+
chatbot.style(height=1100)
|
95 |
+
history = gr.State([])
|
96 |
+
# after creating gradio objects, append to functions to centralize things.
|
97 |
+
functions = GradioUIService.post_define_functions(functions, folder_md)
|
98 |
+
#### handle click(=submit) and cancel behaviour
|
99 |
+
# Standard inputs/outputs (global for all actions)
|
100 |
+
inputs = [apikey_textbox, source_textbox, source_target_textbox, qa_textbox, gpt_model_textbox, language_textbox, chatbot, history]
|
101 |
+
outputs = [chatbot, history, status_md]
|
102 |
+
# fetch_and_summarize_textbox
|
103 |
+
fn_key = "Fetch and summarize!"
|
104 |
+
analyze_code_base_args = dict(fn=functions[fn_key]["function"], inputs=inputs, outputs=[*outputs, folder_md])
|
105 |
+
cancel_handles.append(source_target_textbox.submit(**analyze_code_base_args))
|
106 |
+
# qa_textbox
|
107 |
+
fn_key = "Ask"
|
108 |
+
ask_args = dict(fn=functions[fn_key]["function"], inputs=inputs, outputs=outputs)
|
109 |
+
cancel_handles.append(qa_textbox.submit(**ask_args))
|
110 |
+
# all buttons
|
111 |
+
for fn_key in functions:
|
112 |
+
click_handle = functions[fn_key]["btn"].click(fn=functions[fn_key]["function"],
|
113 |
+
inputs=inputs, outputs=[*outputs, *get_extra_outputs(functions, fn_key)])
|
114 |
+
cancel_handles.append(click_handle)
|
115 |
+
stop_btn.click(fn=None, inputs=None, outputs=None, cancels=cancel_handles)
|
116 |
+
reset_btn.click(fn=lambda: ([], [], "Already reset"), inputs=None, outputs=outputs)
|
117 |
+
demo.title = "DigestEverythingGPT"
|
118 |
+
return demo
|
119 |
+
|
120 |
+
def format_io(self, y):
|
121 |
+
"""
|
122 |
+
Convert the input and output to HTML format.
Paragraphize the input part of the last item in y,
and convert the Markdown and mathematical formulas in the output part to HTML format.
"""

def text_divide_paragraph(text):
    """
    Split the text on line breaks and generate HTML code with explicit line-break tags.
    """
    if '```' in text:
        return text
    else:
        lines = text.split("\n")
        for i, line in enumerate(lines):
            lines[i] = lines[i].replace(" ", "&nbsp;")  # keep runs of spaces visible in HTML
        text = "<br/>".join(lines)
        return text

def close_up_code_segment_during_stream(gpt_reply):
    """
    Handle a GPT output that is cut off mid-stream:
    append '```' at the end of the output if a code fence was left open.
    """
    # guard pattern for normal cases
    if '```' not in gpt_reply:
        return gpt_reply
    if gpt_reply.endswith('```'):
        return gpt_reply

    # otherwise, an odd number of fence markers means an unclosed code block
    segments = gpt_reply.split('```')
    n_mark = len(segments) - 1
    if n_mark % 2 == 1:
        return gpt_reply + '\n```'
    else:
        return gpt_reply

def markdown_convertion(txt):
    """
    Convert markdown text to HTML format.
    """
    pre = '<div class="markdown-body">'
    suf = '</div>'
    # if ('$' in txt) and ('```' not in txt):
    #     return pre + markdown.markdown(txt, extensions=['fenced_code', 'tables']) + '<br><br>' + \
    #            markdown.markdown(convert_math(txt, splitParagraphs=False), extensions=['fenced_code', 'tables']) + suf
    # else:
    #     return pre + markdown.markdown(txt, extensions=['fenced_code', 'tables']) + suf
    return pre + markdown.markdown(txt, extensions=['fenced_code', 'tables']) + suf

if y is None or y == []: return []
i_ask, gpt_reply = y[-1]
i_ask = text_divide_paragraph(i_ask)
gpt_reply = close_up_code_segment_during_stream(gpt_reply)
# HTML conversion of the last chat turn is currently disabled:
# y[-1] = (
#     None if i_ask is None else markdown.markdown(i_ask, extensions=['fenced_code', 'tables']),
#     None if gpt_reply is None else markdown_convertion(gpt_reply)
# )
return y

@staticmethod
def get_theme():
    try:
        set_theme = gr.themes.Default(
            primary_hue=gr.themes.utils.colors.cyan,
            neutral_hue=gr.themes.utils.colors.gray,
            font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
            font_mono=[gr.themes.GoogleFont("JetBrains Mono"), "Consolas", "ui-monospace", "monospace"]
        )
    except Exception as e:
        set_theme = None
        print(f'Please upgrade to a newer version of gradio: {e}')
    return set_theme

@staticmethod
def get_css():
    css = """
    /* Give tables a 1em outer margin, collapse borders between cells, and show empty cells. */
    .markdown-body table {
        margin: 1em 0;
        border-collapse: collapse;
        empty-cells: show;
    }

    /* Table cells: 5px padding, 1.2px border in --border-color-primary. */
    .markdown-body th, .markdown-body td {
        border: 1.2px solid var(--border-color-primary);
        padding: 5px;
    }

    /* Table header background: rgba(175,184,193,0.2). */
    .markdown-body thead {
        background-color: rgba(175,184,193,0.2);
    }

    /* Table header cells: .5em vertical and .2em horizontal padding. */
    .markdown-body thead th {
        padding: .5em .2em;
    }

    /* Adjust the default list indentation so markers align with the text. */
    .markdown-body ol, .markdown-body ul {
        padding-inline-start: 2em !important;
    }

    /* Chat bubble style: radius and maximum width. */
    [class *= "message"] {
        border-radius: var(--radius-xl) !important;
        /* padding: var(--spacing-xl) !important; */
        /* font-size: var(--text-md) !important; */
        /* line-height: var(--line-md) !important; */
        /* min-height: calc(var(--text-md)*var(--line-md) + 2*var(--spacing-xl)); */
        /* min-width: calc(var(--text-md)*var(--line-md) + 2*var(--spacing-xl)); */
    }
    [data-testid = "bot"] {
        max-width: 95%;
        /* width: auto !important; */
        border-bottom-left-radius: 0 !important;
    }
    [data-testid = "user"] {
        max-width: 100%;
        /* width: auto !important; */
        border-bottom-right-radius: 0 !important;
    }

    /* Inline code: light-gray background, rounded corners, small margin and padding. */
    .markdown-body code {
        font-family: 'JetBrains Mono', monospace;
        display: inline;
        white-space: break-spaces;
        border-radius: 6px;
        margin: 0 2px 0 2px;
        padding: .2em .4em .1em .4em;
        background-color: rgba(175,184,193,0.2);
    }
    /* Code blocks: background color, inner and outer margins, rounded corners. */
    .markdown-body pre code {
        font-family: 'JetBrains Mono', monospace;
        display: block;
        overflow: auto;
        white-space: pre;
        background-color: rgba(175,184,193,0.2);
        border-radius: 10px;
        padding: 1em;
        margin: 1em 2em 1em 0.5em;
    }
    """
    return css
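A note on the fence-balancing rule above: it can be exercised in isolation. Below is a minimal standalone sketch of the same logic (in the source it lives as a nested helper of the formatting hook), with a couple of illustrative checks:

def close_up(reply: str) -> str:
    """Append a closing fence if a streamed reply left a code block open."""
    if '```' not in reply or reply.endswith('```'):
        return reply
    # an odd number of fence markers means the last code block is unclosed
    return reply + '\n```' if reply.count('```') % 2 == 1 else reply

# A reply truncated inside a code block gets closed...
assert close_up("Example:\n```python\nprint('hi')") == "Example:\n```python\nprint('hi')\n```"
# ...while balanced or fence-free replies pass through unchanged.
assert close_up("plain text") == "plain text"
assert close_up("```a``` done") == "```a``` done"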
digester/test_chatgpt.py
ADDED
@@ -0,0 +1,106 @@
from chatgpt_service import ChatGPTService
from everything2text4prompt.everything2text4prompt import Everything2Text4Prompt
from everything2text4prompt.util import BaseData, YoutubeData, PodcastData
from gradio_method_service import YoutubeChain, GradioInputs
from digester.util import get_config, Prompt

import json

if __name__ == '__main__':
    config = get_config()
    api_key = config.get("openai").get("api_key")
    assert api_key

    # gpt_model_textbox and language_textbox are required by GradioInputs.__init__ (see digester/util.py)
    gradio_inputs = GradioInputs(apikey_textbox=api_key, source_textbox="", source_target_textbox="", qa_textbox="",
                                 gpt_model_textbox="gpt-3.5-turbo", language_textbox="English", chatbot=[], history=[])
    prompt_str = """
[[[[[INPUT]]]]]

[TITLE]
8 Surprising Habits That Made Me A Millionaire

[Transcript with timestamp]
6:42 "Hey, let's do everything ourselves." That brings us on to habit number six which is to make friends
with people in real life and more importantly,
well, not more importantly, but additionally, on the internet. And the single best way I find
for doing this is Twitter. Twitter is an incredible,
incredible, incredible invention that you can use to make friends with people all around the world. And the nice thing about Twitter is that it's different to Instagram. Instagram is very sort of visual and based on posting pretty pictures, but Twitter is very much
based on sharing good ideas. And if you are sharing interesting ideas and you're connecting with other people who are sharing those similar ideas, that automatically leads you
7:13 to kind of becoming internet friends, and then they follow you, you follow them, you chat a little bit in the DMs. And over the last year, I've
met up with so many people who I initially met on Twitter. And I've got friends all around the world who I've never ever met in real life, but we've talked on Twitter. We know we liked the same stuff. We share the same ideas. And, A, this just makes
life much more fun. But if we're talking about habits to get to becoming a millionaire, I can point to lots of
these different connections that have really accelerated
the growth of my business. For example, me and my mate Thomas Frank became friends on Twitter
like two weeks ago. Thomas Frank then
introduced me to Standard
7:44 which is the YouTuber
agency that I'm now part of and that completely changed
the game for my business. Secondly, there's two chaps,
Tiago Forte and David Perell who run their own online courses. We became friends on Twitter
after I took their courses and started engaging with them on Twitter. And then I DMed them when I wanted help for my own Part-Time YouTuber Academy and they really helped with that. And again, that really accelerated the growth of the business to becoming a $2 million business. And when it comes to this
making friends thing, it's one of those things
that's very hard to like, if you make friends with someone,
then it will lead to this. It's more like you have
this general habit, this general attitude
towards making friends with whoever shares the same ideas as you
8:16 and just generally trying to
be a nice and helpful person, and you know that, eventually, that'll lead to really interesting things happening in your life
further down the line. On a somewhat related note, habit number seven is reading a lot. And just like we can get
wisdom from our real life and our internet friends via Twitter, we can get a lot more wisdom from people who have
written books about stuff. You know, if you speak to anyone who's successful in almost any way, they will almost always say
that they read a lot of books. And they will also almost always say that everyone else that
they know who's successful also reads a lot of books.
8:47 So if you're telling yourself,
"I don't have time to read," then you're kind of screwing yourself because (laughs) basically
every millionaire you ask will have spent tonnes and
tonnes of time reading books. And again, the great thing about books is that you've got five,
10, 20 years of experience that someone has boiled down to a thing that takes you
a few hours to read. Like Tim Ferriss was doing
the entrepreneurial thing for 10 years before he wrote the book. That's pretty sick. That's 10 years of wisdom that
you can read in a few hours. And if you read lots of books
of this or entrepreneurship, like business, finance,
9:19 basically anything you're interested in, you can just get a huge
amount of value from them. And it doesn't really cost very much. You can find PDFs on the internet for free if you're really averse
to paying for books if that's your vibe. And it's just such a great way
to accelerate your learning in almost anything. If you didn't know, I
am also writing a book, which is probably gonna
come out in two years' time. But I'll put a link to my
book mailing list newsletter, which is where I share my book journey and what it's like to
write and research a book and sample chapters and getting the audience's
opinion and stuff. So that'll be linked in
the video description if you wanna check it out. And finally, habit number eight
for becoming a millionaire is to acquire financial literacy.
9:51 Now, this is one of those things that no one teaches us in
school or university or college, but it's just one of those things that you have to learn for yourself. And you can get it through reading books, such as, for example, this book, oh crap, "The Psychology of
Money" by Morgan Housel, which is now a little bit dilapidated. I read this recently. It's
really, really, really good. 20 bite-sized lessons about money. Gonna make a video about that. But also just generally
taking your financial life into your own hands. I know so many people who have sort of relegated their financial
life to, you know, "Oh, it's just something that
the government will sort out."
10:22 Or "Oh, you know, my hospital "will figure out what taxes I need to pay "and then I'll just kind
of do it from there." Money is such an important part of life. It's one of the biggest sources
of stress in anyone's life if you don't have much of it. And so much of our life is
spent in the pursuit of money and financial freedom,
financial independence, that if we don't have financial literacy, if we don't understand the
basics of saving or investing or how the stock market
works or how taxes work, any of that kind of stuff, again, we are just screwing ourselves. Because if you wanna become a millionaire you have to have some
level of financial literacy to know what it takes
to become a millionaire and how that might actually work.
10:53 So recommend reading a book like "The Psychology of
Money by Morgan Housel. Or, if you like, you can check
out this video over here, which is my ultimate guide to investing in stocks and shares. That's like a half an
hour-long crash course on everything you need
to know about investing. If you don't know about investing definitely check out that video. Thank you so much for watching. Hope you found this video useful. And I will see you in
the next one. Bye-bye.


[TASK]
Convert this into a YouTube summary.
Separate into 2-5 minute chunks, maximum 20 words per line.
Start with the timestamp followed by the summarized text for that chunk.
Example format:
6:42 - This is the first part
8:00 - This is the second part
9:22 - This is the third part
"""
    GPT_MODEL = "gpt-3.5-turbo"
    ChatGPTService.single_rest_call_chatgpt(api_key, prompt_str, GPT_MODEL)
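The [TASK] block pins the reply format the test expects. For orientation, a plausible reply shape for the transcript above might look like the following (illustrative text only, not captured model output):

# Illustrative reply shape only — not real output from the model:
expected_reply = """\
6:42 - Habit 6: make friends online, especially on Twitter; connections accelerate business growth
8:16 - Habit 7: read a lot; books compress years of experience into hours
9:51 - Habit 8: acquire financial literacy; understand saving, investing, and taxes
"""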
digester/test_youtube_chain.py
ADDED
@@ -0,0 +1,102 @@
from chatgpt_service import ChatGPTService
from everything2text4prompt.everything2text4prompt import Everything2Text4Prompt
from everything2text4prompt.util import BaseData, YoutubeData, PodcastData
from gradio_method_service import YoutubeChain, GradioInputs
from digester.util import get_config, Prompt

import json


class VideoExample:
    def __init__(self, title, description, transcript):
        self.title = title
        self.description = description
        self.transcript = transcript

    @classmethod
    def get_youtube_data(cls, api_key: str, video_id: str):
        converter = Everything2Text4Prompt(openai_api_key=api_key)
        text_data, is_success, error_msg = converter.convert_text("youtube", video_id)
        text_data: YoutubeData
        title = text_data.title
        description = text_data.description
        transcript = text_data.full_content
        ts_transcript_list = text_data.ts_transcript_list
        return YoutubeData(transcript, title, description, ts_transcript_list)

    @staticmethod
    def get_nthings_10_autogpt():
        video_id = "lSTEhG021Jc"
        return VideoExample.get_youtube_data("", video_id)

    @staticmethod
    def get_nthings_7_lifelesson():
        video_id = "CUPe_TZECQQ"
        return VideoExample.get_youtube_data("", video_id)

    @staticmethod
    def get_nthings_8_habits():
        video_id = "IScN1SOcj7A"
        return VideoExample.get_youtube_data("", video_id)

    @staticmethod
    def get_tutorial_skincare():
        video_id = "OrElyY7MFVs"
        return VideoExample.get_youtube_data("", video_id)


class YoutubeTestChain:
    def __init__(self, api_key: str, gpt_model="gpt-3.5-turbo"):
        self.api_key = api_key
        self.gpt_model = gpt_model

    def run_testing_chain(self):
        input_1 = """Give me 2 ideas for the summer"""
        # input_1 = """Explain more on the first idea"""
        response_1 = ChatGPTService.single_rest_call_chatgpt(self.api_key, input_1, self.gpt_model)

        input_2 = """
        For the first idea, suggest some step by step planning for me
        """
        response_2 = ChatGPTService.single_rest_call_chatgpt(self.api_key, input_2, self.gpt_model, history=[input_1, response_1])

    def test_youtube_classifier(self, gradio_inputs: GradioInputs, youtube_data: YoutubeData):
        # drain the generator with a for-loop; `while True: next(it)` would end
        # in an uncaught StopIteration once the chain is exhausted
        for _ in YoutubeChain.execute_classifer_chain(gradio_inputs, youtube_data):
            pass

    def test_youtube_timestamped_summary(self, gradio_inputs: GradioInputs, youtube_data: YoutubeData):
        for _ in YoutubeChain.execute_timestamped_summary_chain(gradio_inputs, youtube_data):
            pass

    def test_youtube_final_summary(self, gradio_inputs: GradioInputs, youtube_data: YoutubeData, video_type):
        for _ in YoutubeChain.execute_final_summary_chain(gradio_inputs, youtube_data, video_type):
            pass


if __name__ == '__main__':
    config = get_config()
    api_key = config.get("openai").get("api_key")
    assert api_key

    # gpt_model_textbox and language_textbox are required by GradioInputs.__init__ (see digester/util.py)
    gradio_inputs = GradioInputs(apikey_textbox=api_key, source_textbox="", source_target_textbox="", qa_textbox="",
                                 gpt_model_textbox="gpt-3.5-turbo", language_textbox="English", chatbot=[], history=[])
    youtube_data: YoutubeData = VideoExample.get_nthings_8_habits()

    youtube_test_chain = YoutubeTestChain(api_key)
    # youtube_test_chain.test_youtube_classifier(gradio_inputs, youtube_data)
    youtube_test_chain.test_youtube_timestamped_summary(gradio_inputs, youtube_data)
    # video_type = "N things"
    # video_type = "Tutorials"
    # video_type = "Others"
    # youtube_test_chain.test_youtube_final_summary(gradio_inputs, youtube_data, video_type)

    # converter = Everything2Text4Prompt(openai_api_key="")
    # source_textbox = "youtube"
    # target_source_textbox = "CUPe_TZECQQ"
    # text_data, is_success, error_msg = converter.convert_text(source_textbox, target_source_textbox)
    # print(text_data.title)
    # print(text_data.description)
    # print(text_data.full_content)
    # print(text_data.ts_transcript_list)
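Note that `GradioInputs` (defined in `digester/util.py` below) takes eight fields, which is why both test scripts pass `gpt_model_textbox` and `language_textbox` explicitly. A minimal construction sketch with illustrative values:

from digester.util import GradioInputs  # also re-exported via gradio_method_service

inputs = GradioInputs(
    apikey_textbox="sk-...",              # illustrative placeholder key
    source_textbox="youtube",
    source_target_textbox="IScN1SOcj7A",  # video id used by get_nthings_8_habits()
    qa_textbox="",
    gpt_model_textbox="gpt-3.5-turbo",
    language_textbox="English",           # assumption: any output-language label works
    chatbot=[],
    history=[],
)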
digester/util.py
ADDED
@@ -0,0 +1,86 @@
import os
from pathlib import Path

import tiktoken
import yaml

tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")


class GradioInputs:
    """
    This DTO class formalizes the format of the "inputs" coming from gradio and prevents long method signatures.
    It is constructed in GradioMethodService.
    """

    def __init__(self, apikey_textbox, source_textbox, source_target_textbox, qa_textbox, gpt_model_textbox, language_textbox, chatbot, history):
        self.apikey_textbox = apikey_textbox
        self.source_textbox = source_textbox
        self.source_target_textbox = source_target_textbox
        self.qa_textbox = qa_textbox
        self.gpt_model_textbox = gpt_model_textbox
        self.language_textbox = language_textbox
        self.chatbot = chatbot
        self.history = history


class Prompt:
    """
    Defines the prompt structure:
    Prompt = "{prompt_prefix}{prompt_main}{prompt_suffix}"
    where, if the prompt is too long, {prompt_main} is split into multiple parts to fit the context length of the LLM.

    Example: for the YouTube timestamped summary
    prompt_prefix: YouTube video type definitions, title
    prompt_main: transcript (splittable)
    prompt_suffix: task description / constraints
    """

    def __init__(self, prompt_prefix, prompt_main, prompt_suffix):
        self.prompt_prefix = prompt_prefix
        self.prompt_main = prompt_main
        self.prompt_suffix = prompt_suffix


def get_project_root():
    return Path(__file__).parent.parent


def get_config():
    with open(os.path.join(get_project_root(), 'config/config.yaml'), encoding='utf-8') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    try:
        with open(os.path.join(get_project_root(), 'config/config_secret.yaml'), encoding='utf-8') as f:
            config_secret = yaml.load(f, Loader=yaml.FullLoader)
        config.update(config_secret)
    except FileNotFoundError:
        pass  # it is fine not to have config_secret.yaml
    return config


def get_token(text: str):
    return len(tokenizer.encode(text, disallowed_special=()))


def get_first_n_tokens_and_remaining(text: str, n: int):
    tokens = tokenizer.encode(text, disallowed_special=())
    return tokenizer.decode(tokens[:n]), tokenizer.decode(tokens[n:])


def provide_text_with_css(text, color):
    if color == "red":
        return f'<span style="background-color: red; color: white; padding: 3px; border-radius: 8px;">{text}</span>'
    elif color == "green":
        return f'<span style="background-color: #307530; color: white; padding: 3px; border-radius: 8px;">{text}</span>'
    elif color == "blue":
        return f'<span style="background-color: #7b7bff; color: white; padding: 3px; border-radius: 8px;">{text}</span>'
    elif color == "yellow":
        return f'<span style="background-color: yellow; color: black; padding: 3px; border-radius: 8px;">{text}</span>'
    else:
        return text


if __name__ == '__main__':
    # print(get_token("def get_token(text: str)"))
    # print(get_token("皆さんこんにちは"))
    print(get_first_n_tokens_and_remaining("This is a string with some text to tokenize.", 30))
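The `Prompt` docstring above says `prompt_main` is split into multiple parts when the prompt exceeds the LLM's context length. A sketch of how such splitting can be built from the two token helpers in this module (the 1000-token window is an arbitrary illustration, not a value taken from the project):

from digester.util import get_token, get_first_n_tokens_and_remaining

def split_by_tokens(text: str, window: int = 1000):
    """Cut text into consecutive chunks of at most `window` tokens each."""
    chunks = []
    while get_token(text) > window:
        head, text = get_first_n_tokens_and_remaining(text, window)
        chunks.append(head)
    chunks.append(text)  # final remainder fits within the window
    return chunks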
img/final_full_summary.png
ADDED
img/in_process.png
ADDED
img/multi_language.png
ADDED
img/n_things_example.png
ADDED
main.py
ADDED
@@ -0,0 +1,28 @@
import os
import threading
import time
import webbrowser

from digester.gradio_ui_service import GradioUIService
from digester.util import get_config

os.makedirs("analyzer_logs", exist_ok=True)


def opentab_with_delay(port):
    def open_browser():  # renamed from `open` to avoid shadowing the builtin
        time.sleep(2)
        webbrowser.open_new_tab(f"http://localhost:{port}/?__theme=dark")

    threading.Thread(target=open_browser, name="open-browser", daemon=True).start()


if __name__ == '__main__':
    config = get_config()
    port = config["gradio"]["port"]
    opentab_with_delay(port)
    demo = GradioUIService.get_gradio_ui()
    demo.queue(concurrency_count=config['gradio']['concurrent']).launch(
        server_name="0.0.0.0", server_port=port,
        share=True
    )
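main.py reads `gradio.port` and `gradio.concurrent` from the YAML config, and the test scripts read `openai.api_key`. The dict returned by `get_config()` therefore has roughly the following shape; the values here are illustrative, and the real ones live in `config/config.yaml` (optionally overridden by `config/config_secret.yaml`):

# Illustrative shape of the dict returned by get_config():
config = {
    "openai": {"api_key": "sk-..."},             # supply your own key
    "gradio": {"port": 7860, "concurrent": 10},  # assumed example values
}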
requirements.txt
ADDED
@@ -0,0 +1,7 @@
gradio==3.24.1
gradio_client==0.0.7
tiktoken>=0.3.3
openai
Markdown
latex2mathml
everything2text4prompt