Spaces:

imh0
/

Doodle

Sleeping

App Files Files Community

im committed on Aug 28, 2023

Commit

97b7ebb

1 Parent(s): e304b0b

init commit

Browse files

Files changed (8) hide show

.gitattributes +1 -0
.gitignore +166 -0
.streamlit/config.toml +3 -0
LICENSE +21 -0
README.md +11 -4
app.py +254 -0
assets/doodle.jpg +0 -0
requirements.txt +7 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/doodle.jpg filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,166 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# flask
+flask_session
+*.log
+datasets/
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
+.streamlit/secrets.toml

.streamlit/config.toml ADDED Viewed

	@@ -0,0 +1,3 @@

+[theme]
+base="light"
+font="sans serif"

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2023 Runzhe Yang
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md CHANGED Viewed

@@ -1,8 +1,8 @@
 ---
 title: Doodle
-emoji: 🦀
-colorFrom: gray
-colorTo: purple
 sdk: streamlit
 sdk_version: 1.26.0
 app_file: app.py
@@ -10,4 +10,11 @@ pinned: false
 license: mit
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: Doodle
+emoji: 🌖
+colorFrom: green
+colorTo: green
 sdk: streamlit
 sdk_version: 1.26.0
 app_file: app.py
 license: mit
 ---
+# Doodle
+## Limitations and Disclaimer
+Hey there, eagle-eyed reader! 👀 Just a quick doodle of a note: if you spot any names or brands that ring a bell, rest assured it's all in the spirit of cosmic coincidence. We're not trying to mimic, mirror, or muddle with anyone's mojo. Doodle is all about good vibes and intellectual frolics, not about stealing someone's thunder. 🌩️ So relax, enjoy, and remember: any resemblance to existing entities is as unplanned as that doodle that accidentally looked like your high school gym teacher. Happy Doodling!
+While the application aims to provide informative and engaging dialogues, it's important to note that the AI's responses are generated based on pre-existing knowledge and may not always reflect the most current or accurate information. Always cross-check critical information with other sources.

app.py ADDED Viewed

	@@ -0,0 +1,254 @@

+import streamlit as st
+import openai
+import logging
+import sys
+import os
+import re
+from langchain.chat_models import ChatOpenAI
+from langchain.llms import OpenAI
+from crawlbase import CrawlingAPI
+from langchain.output_parsers import StructuredOutputParser
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.vectorstores import Qdrant
+from langchain.prompts import ChatPromptTemplate
+from elevenlabs import generate, play, set_api_key
+from langchain.schema import (
+    AIMessage,
+    HumanMessage,
+    SystemMessage
+)
+import random
+# Read the third-party API credentials from Streamlit secrets; the ElevenLabs
+# key is registered immediately, the other two are kept as module globals.
+set_api_key(st.secrets["ELEVENLABS_API_KEY"])
+crawling_api_key = st.secrets["CRAWLING_API_KEY"]
+open_api_key = st.secrets["OPENAI_API_KEY"]
+# Send log records to stdout at DEBUG level.
+logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+# Title and icon passed to the Streamlit page configuration below.
+PAGE_TITLE: str = "Doodle"
+PAGE_ICON: str = "🗨️"
+st.set_page_config(page_title=PAGE_TITLE, page_icon=PAGE_ICON)
+def get_llm(model_name, model_temperature, api_key, max_tokens=None):
+    if model_name == "text-davinci-003":
+        return OpenAI(temperature=model_temperature, model_name=model_name, max_tokens=max_tokens,
+                      openai_api_key=api_key)
+    else:
+        return ChatOpenAI(temperature=model_temperature, model_name=model_name, max_tokens=max_tokens,
+                          openai_api_key=api_key)
+def is_valid_web_link(text):
+    # Regular expression pattern to match a valid URL
+    url_pattern = re.compile(
+        r"^(https?)://"
+        r"[\w\-]+(\.[\w\-]+)+"  # Domain name (e.g., www.example.com)
+        r"(:\d+)?(/[\w\-./?%&=]*)?$"  # Optional port and path
+    )
+    return bool(url_pattern.match(text))
+@st.cache_data
+def scrape_the_article(url):
+    api = CrawlingAPI({'token': crawling_api_key})
+    response = api.get(url, options={'format': 'json', 'autoparse': 'true', 'scroll': 'true'})
+    # dict_keys(['alert', 'title', 'favicon', 'meta', 'content', 'canonical', 'images', 'grouped_images', 'og_images', 'links'])
+    content = response['json']
+    return content
+def init_session() -> None:
+    st.session_state.context = None
+    st.session_state.question = None
+    st.session_state.sub_questions = None
+    st.session_state.end = False
+    st.session_state.messages = []
+    st.session_state.open_api_key = open_api_key
+@st.cache_data
+def get_content_summary(content, model_name, api_key):
+    # Ask the LLM for an overall summary of `content` plus per-block summaries
+    # and questions, and return the parsed reply as a dict. Callers read the
+    # 'summary' and 'blocks' keys described in the format instructions below.
+    # Temperature is fixed at 0.
+    llm = get_llm(model_name=model_name, model_temperature=0, api_key=api_key)
+    # Hand-written description of the JSON shape the model is asked to emit.
+    format_instructions = \
+        """
+        The output should be a markdown code snippet formatted in the following schema, including the leading and trailing \\"```json\\" and \\"```\\":
+        ```json{
+        "summary": string // overall text summary
+        "blocks": [
+        {
+            "block_summary": string // The summary of the first block
+            "block_question": string // What is the question to clarify?
+        }, ...
+        ]}
+        ```
+        """
+    prompt_template = """You are an advanced copywriter who can discuss and summarise articles. Translate the text to English if required. You instructions: 1) Write a concise summary of the whole text; 2) Break down the text into logical blocks containing unique information, extract important information for each block and write a summary using this information; 3) Generate relevant critical questions. Here is the text:
+    ``` {text} ```
+    Format instructions: ``` {format_instructions} ```
+    Answer:"""
+    prompt = ChatPromptTemplate.from_template(template=prompt_template)
+    messages = prompt.format_messages(text=content, format_instructions=format_instructions)
+    # Both the full prompt (including the page text) and the raw reply are logged.
+    logging.info(messages)
+    response = llm(messages)
+    logging.info(response)
+    # NOTE(review): the parser is built with an empty schema list, so it
+    # presumably only extracts the fenced JSON and does not check for the
+    # expected keys — confirm against the langchain version in use.
+    output_parser = StructuredOutputParser.from_response_schemas([])
+    output_dict = output_parser.parse(response.content)
+    return output_dict
+@st.cache_data
+def generate_audio(text):
+    audio = generate(
+        text=text,
+        voice="Matthew" if random.randint(1, 10) % 2 == 0 else 'Dorothy',
+        model="eleven_monolingual_v1"
+    )
+    return audio
+@st.cache_resource
+def get_retriever(content):
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=300,  # it depends on the retriever parameters and the model's context length
+        chunk_overlap=20,
+        length_function=len,
+        is_separator_regex=False,
+    )
+    docs = text_splitter.create_documents([content])
+    embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
+    qdrant = Qdrant.from_documents(
+        docs, embeddings,
+        location=":memory:",
+        collection_name="qa"
+    )
+    return qdrant
+@st.cache_data
+def qa(query, documents_to_search, model_name, api_key):
+    retriever = get_retriever(st.session_state.content)
+    found_docs = retriever.similarity_search(query, k=documents_to_search)
+    llm = get_llm(model_name=model_name, model_temperature=0, api_key=api_key)
+    template = \
+        """
+        You're an experienced copywriter. Answer the question in English. Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
+        Answer the question in a way so that the reader has no more questions. Be concise. Make sure you mention all the important information. You can add additional relevant information from yourself that you think may contribute to the overall understanding. Asses critically the provided context, chat history or own answer.
+        Chat History: ``` {chat_history} ```
+        Context: ``` {context} ```
+        Question: ``` {question} ```
+        Helpful Answer:
+    """
+    prompt = ChatPromptTemplate.from_template(template=template)
+    chat_history = [AIMessage(content=' '.join(st.session_state.content_block_summary))]
+    messages = prompt.format_messages(context=found_docs, question=query, chat_history=chat_history)
+    response = llm(messages)
+    return response.content
+def show_audio_message(message):
+    st.write(message)
+    content_summary_audio = generate_audio(message)
+    st.audio(content_summary_audio)
+def show_question_input():
+    # Render the question text area, a submit button and an expander with the
+    # example questions; a chosen question ends up in st.session_state.question.
+    def submit_question():
+        # on_change callback of the text area: move non-empty text into
+        # `question` and clear the widget; empty input is only logged.
+        if len(st.session_state.user_question_widget) != 0:
+            st.session_state.question = st.session_state.user_question_widget
+            st.session_state.user_question_widget = ''
+        else:
+            logging.info("empty user question")
+    st.text_area(label="Ask your question about the content of the page:",
+                 key='user_question_widget',
+                 on_change=submit_question)
+    # NOTE(review): the button has no callback of its own; presumably the click
+    # only triggers a rerun so the text area's on_change fires — confirm.
+    st.button("Submit")
+    def on_question_button(question):
+        # on_click callback of an example-question button.
+        st.session_state.question = question
+    with st.expander("Example questions:"):
+        # One button per entry of st.session_state.content_block_questions.
+        for q in st.session_state.content_block_questions:
+            st.button(q, on_click=on_question_button, args=[q])
+def main() -> None:
+    try:
+        if 'web_url' in st.session_state:
+            col1, col2 = st.columns(2)
+            col1.caption(f"discussing: {st.session_state.web_url}")
+            col2.caption(f"{st.session_state.title}")
+        if 'content' not in st.session_state:
+            init_session()
+            st.header("Doodle")
+            st.image("./assets/doodle.jpg")
+            description = """\
+                Meet 'Doodle,' your shortcut to understanding the web! Got a lengthy article you're eyeing?
+                Just paste the link, and in an instant, Doodle delivers a crisp summary and intriguing questions for you to
+                chew on. Want to go hands-free? Doodle's text-to-speech feature will read it to you! Why the name 'Doodle'?
+                Just as a simple doodle can encapsulate a whole idea, we distill webpages down to their essence!
+            """
+            st.caption(description)
+            st.divider()
+            content_url = st.text_input(label='Paste your link, e.g. https://expresso.today', label_visibility='collapsed',
+                                        placeholder='Paste your link:')
+            col1, _, _, _, col2 = st.columns(5)
+            col1.button("Doodle")
+            if col2.button("Random Page"):
+                content_url = 'https://mailchi.mp/expresso/lightpeak'
+            if len(content_url) > 0:
+                if is_valid_web_link(content_url):
+                    with st.spinner(f"reading the web page '{content_url}' ..."):
+                        st.session_state.web_url = content_url
+                        st.session_state.web_page = scrape_the_article(content_url)
+                        st.session_state.title = st.session_state.web_page['title']
+                        st.session_state.content = st.session_state.web_page['content']
+                    st.experimental_rerun()
+                else:
+                    st.warning("invalid link")
+        elif 'content_summary' not in st.session_state:
+            content_summary = get_content_summary(content=st.session_state.content, model_name="gpt-3.5-turbo-16k",
+                                                  api_key=st.session_state.open_api_key)
+            st.session_state.content_summary = content_summary['summary']
+            st.session_state.content_block_summary = [s['block_summary'] for s in content_summary['blocks']]
+            st.session_state.content_block_questions = [s['block_question'] for s in content_summary['blocks']]
+            show_audio_message(st.session_state.content_summary)
+            show_question_input()
+        elif 'question' in st.session_state and st.session_state.question is not None:
+            question = st.session_state.question
+            st.subheader(question)
+            st.divider()
+            with st.spinner(f'answering the question...'):
+                answer = qa(query=question, documents_to_search=20, model_name='gpt-4',
+                            api_key=st.session_state.open_api_key)
+                if random.randint(0, 10) % 2 == 0:
+                    raise Exception("test")
+                show_audio_message(answer)
+                st.session_state.question = None
+                show_question_input()
+        else:
+            show_question_input()
+    except Exception as e:
+        st.warning("Whoops, looks like a hiccup in the system! But no worries, our tech wizards are already on the case, working their magic. In the meantime, how about giving it another shot?")
+        if st.button("Give It Another Go!"):
+            st.experimental_rerun()
+# Script entry point: render the app when this file is run as the main module.
+if __name__ == "__main__":
+    main()
+# TODO:
+# - chat history
+# - store history externally along with audio description and return from cache

assets/doodle.jpg ADDED Viewed

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+openai~=0.27.9
+streamlit~=1.26.0
+langchain~=0.0.273
+crawlbase~=1.0.0
+tiktoken~=0.4.0
+qdrant-client~=1.4.0
+elevenlabs~=0.2.24