im committed on
Commit
97b7ebb
·
1 Parent(s): e304b0b

init commit

Browse files
Files changed (8) hide show
  1. .gitattributes +1 -0
  2. .gitignore +166 -0
  3. .streamlit/config.toml +3 -0
  4. LICENSE +21 -0
  5. README.md +11 -4
  6. app.py +254 -0
  7. assets/doodle.jpg +0 -0
  8. requirements.txt +7 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/doodle.jpg filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # flask
86
+ flask_session
87
+ *.log
88
+ datasets/
89
+
90
+ # pyenv
91
+ # For a library or package, you might want to ignore these files since the code is
92
+ # intended to run in multiple environments; otherwise, check them in:
93
+ # .python-version
94
+
95
+ # pipenv
96
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
97
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
98
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
99
+ # install all needed dependencies.
100
+ #Pipfile.lock
101
+
102
+ # poetry
103
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
104
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
105
+ # commonly ignored for libraries.
106
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
107
+ #poetry.lock
108
+
109
+ # pdm
110
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
111
+ #pdm.lock
112
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
113
+ # in version control.
114
+ # https://pdm.fming.dev/#use-with-ide
115
+ .pdm.toml
116
+
117
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
118
+ __pypackages__/
119
+
120
+ # Celery stuff
121
+ celerybeat-schedule
122
+ celerybeat.pid
123
+
124
+ # SageMath parsed files
125
+ *.sage.py
126
+
127
+ # Environments
128
+ .env
129
+ .venv
130
+ env/
131
+ venv/
132
+ ENV/
133
+ env.bak/
134
+ venv.bak/
135
+
136
+ # Spyder project settings
137
+ .spyderproject
138
+ .spyproject
139
+
140
+ # Rope project settings
141
+ .ropeproject
142
+
143
+ # mkdocs documentation
144
+ /site
145
+
146
+ # mypy
147
+ .mypy_cache/
148
+ .dmypy.json
149
+ dmypy.json
150
+
151
+ # Pyre type checker
152
+ .pyre/
153
+
154
+ # pytype static type analyzer
155
+ .pytype/
156
+
157
+ # Cython debug symbols
158
+ cython_debug/
159
+
160
+ # PyCharm
161
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
162
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
163
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
164
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
165
+ .idea/
166
+ .streamlit/secrets.toml
.streamlit/config.toml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [theme]
2
+ base="light"
3
+ font="sans serif"
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Runzhe Yang
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
  title: Doodle
3
- emoji: 🦀
4
- colorFrom: gray
5
- colorTo: purple
6
  sdk: streamlit
7
  sdk_version: 1.26.0
8
  app_file: app.py
@@ -10,4 +10,11 @@ pinned: false
10
  license: mit
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
1
  ---
2
  title: Doodle
3
+ emoji: 🌖
4
+ colorFrom: green
5
+ colorTo: green
6
  sdk: streamlit
7
  sdk_version: 1.26.0
8
  app_file: app.py
 
10
  license: mit
11
  ---
12
 
13
+ # Doodle
14
+
15
+ ## Limitations and Disclaimer
16
+
17
+ Hey there, eagle-eyed reader! 👀 Just a quick doodle of a note: if you spot any names or brands that ring a bell, rest assured it's all in the spirit of cosmic coincidence. We're not trying to mimic, mirror, or muddle with anyone's mojo. Doodle is all about good vibes and intellectual frolics, not about stealing someone's thunder. 🌩️ So relax, enjoy, and remember: any resemblance to existing entities is as unplanned as that doodle that accidentally looked like your high school gym teacher. Happy Doodling!
18
+
19
+ While the application aims to provide informative and engaging dialogues, it's important to note that the AI's responses are generated based on pre-existing knowledge and may not always reflect the most current or accurate information. Always cross-check critical information with other sources.
20
+
app.py ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import openai
3
+ import logging
4
+ import sys
5
+ import os
6
+ import re
7
+ from langchain.chat_models import ChatOpenAI
8
+ from langchain.llms import OpenAI
9
+ from crawlbase import CrawlingAPI
10
+ from langchain.output_parsers import StructuredOutputParser
11
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
12
+ from langchain.embeddings.openai import OpenAIEmbeddings
13
+ from langchain.vectorstores import Qdrant
14
+ from langchain.prompts import ChatPromptTemplate
15
+ from elevenlabs import generate, play, set_api_key
16
+ from langchain.schema import (
17
+ AIMessage,
18
+ HumanMessage,
19
+ SystemMessage
20
+ )
21
+ import random
22
+
23
# --- Service credentials, read from Streamlit's secrets store ---
set_api_key(st.secrets["ELEVENLABS_API_KEY"])
crawling_api_key = st.secrets["CRAWLING_API_KEY"]
open_api_key = st.secrets["OPENAI_API_KEY"]

# Send every record from DEBUG upwards to stdout.
logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)

# --- Browser tab title and icon ---
PAGE_TITLE: str = "Doodle"
PAGE_ICON: str = "🗨️"

st.set_page_config(page_icon=PAGE_ICON, page_title=PAGE_TITLE)
33
+
34
+
35
def get_llm(model_name, model_temperature, api_key, max_tokens=None):
    """Build the LangChain model wrapper that matches *model_name*.

    ``"text-davinci-003"`` is wrapped with ``OpenAI``; every other model name
    is wrapped with ``ChatOpenAI``. Both receive the same temperature, token
    limit and API key.
    """
    wrapper_cls = OpenAI if model_name == "text-davinci-003" else ChatOpenAI
    return wrapper_cls(
        temperature=model_temperature,
        model_name=model_name,
        max_tokens=max_tokens,
        openai_api_key=api_key,
    )
42
+
43
+
44
def is_valid_web_link(text):
    """Return True when *text* is an absolute http(s) URL with a dotted host name.

    The URL is parsed with ``urllib.parse.urlsplit`` instead of a hand-written
    pattern, so links containing fragments (``#``), ``~``, ``+``, ``,`` and
    similar legal characters in the path or query are accepted.

    A link is rejected when it:
      * is empty or not a string,
      * contains any whitespace (including a trailing newline),
      * uses a scheme other than ``http`` / ``https``,
      * has a malformed or out-of-range port,
      * has a host without at least two non-empty ``[\\w-]+`` labels
        (so ``http://localhost`` is still rejected).
    """
    from urllib.parse import urlsplit  # local import keeps the block self-contained

    if not isinstance(text, str) or not text:
        return False
    # Whitespace is never legal inside a URL; checking first also keeps
    # urlsplit from silently stripping tabs/newlines.
    if any(ch.isspace() for ch in text):
        return False

    try:
        parts = urlsplit(text)
        _ = parts.port  # accessing the property validates the port (ValueError if bad)
    except ValueError:
        return False

    if parts.scheme not in ("http", "https"):
        return False

    labels = (parts.hostname or "").split(".")
    if len(labels) < 2:
        return False
    return all(re.fullmatch(r"[\w\-]+", label) is not None for label in labels)
52
+
53
+
54
@st.cache_data
def scrape_the_article(url):
    """Fetch *url* through the Crawlbase crawling API and return its parsed payload.

    The request asks for JSON output with auto-parsing and scrolling enabled;
    the value stored under the response's ``'json'`` key is returned.
    """
    request_options = {'format': 'json', 'autoparse': 'true', 'scroll': 'true'}
    crawler = CrawlingAPI({'token': crawling_api_key})
    # Keys observed in the parsed payload:
    # dict_keys(['alert', 'title', 'favicon', 'meta', 'content', 'canonical', 'images', 'grouped_images', 'og_images', 'links'])
    return crawler.get(url, options=request_options)['json']
61
+
62
+
63
def init_session() -> None:
    """Reset the conversation-related entries of ``st.session_state`` to their defaults."""
    initial_values = {
        "context": None,
        "question": None,
        "sub_questions": None,
        "end": False,
        "messages": [],
        "open_api_key": open_api_key,
    }
    for key, value in initial_values.items():
        st.session_state[key] = value
72
+
73
+
74
@st.cache_data
def get_content_summary(content, model_name, api_key):
    """Ask the LLM for an overall summary plus per-block summaries and questions.

    A temperature-0 model is prompted with the page text and a JSON schema
    description; the reply is parsed from its markdown ``json`` code block and
    returned as a dict (the prompt requests ``summary`` and ``blocks`` keys).
    """
    schema_description = \
"""
The output should be a markdown code snippet formatted in the following schema, including the leading and trailing \\"```json\\" and \\"```\\":
```json{
"summary": string // overall text summary
"blocks": [
{
"block_summary": string // The summary of the first block
"block_question": string // What is the question to clarify?
}, ...
]}
```
"""
    instruction_text = """You are an advanced copywriter who can discuss and summarise articles. Translate the text to English if required. You instructions: 1) Write a concise summary of the whole text; 2) Break down the text into logical blocks containing unique information, extract important information for each block and write a summary using this information; 3) Generate relevant critical questions. Here is the text:
``` {text} ```
Format instructions: ``` {format_instructions} ```
Answer:"""

    model = get_llm(model_name=model_name, model_temperature=0, api_key=api_key)
    chat_prompt = ChatPromptTemplate.from_template(template=instruction_text)
    request = chat_prompt.format_messages(text=content, format_instructions=schema_description)
    logging.info(request)
    reply = model(request)
    logging.info(reply)

    # An empty schema list means no specific keys are enforced while parsing.
    return StructuredOutputParser.from_response_schemas([]).parse(reply.content)
103
+
104
+
105
@st.cache_data
def generate_audio(text):
    """Synthesize *text* to speech with ElevenLabs, picking one of two voices at random.

    The result is cached by Streamlit, so a given text keeps the audio (and
    voice) produced the first time it was requested.
    """
    if random.randint(1, 10) % 2 == 0:
        voice_name = "Matthew"
    else:
        voice_name = 'Dorothy'
    return generate(text=text, voice=voice_name, model="eleven_monolingual_v1")
113
+
114
+
115
@st.cache_resource
def get_retriever(content):
    """Split *content* into chunks, embed them and index them in an in-memory Qdrant store.

    Returns the Qdrant vector store built from the chunks (collection ``"qa"``).
    """
    splitter_options = dict(
        chunk_size=300,  # it depends on the retriever parameters and the model's context length
        chunk_overlap=20,
        length_function=len,
        is_separator_regex=False,
    )
    chunks = RecursiveCharacterTextSplitter(**splitter_options).create_documents([content])
    return Qdrant.from_documents(
        chunks,
        OpenAIEmbeddings(model="text-embedding-ada-002"),
        location=":memory:",
        collection_name="qa",
    )
131
+
132
+
133
def qa(query, documents_to_search, model_name, api_key):
    """Answer *query* about the page currently held in ``st.session_state``.

    The page text and its block summaries are read from the session and passed
    explicitly to the cached worker, so the cache key includes the page being
    discussed. (Caching this function directly keyed only on the question and
    model, which returned an answer computed for a previous page when the same
    question was asked about a new one.)

    Parameters:
        query: the user's question.
        documents_to_search: number of chunks to retrieve as context.
        model_name: chat model used to write the answer.
        api_key: OpenAI API key.

    Returns:
        The model's answer text.
    """
    return _answer_for_page(
        query,
        documents_to_search,
        model_name,
        api_key,
        st.session_state.content,
        # tuple: an immutable snapshot of the summaries used as part of the cache key
        tuple(st.session_state.content_block_summary),
    )


@st.cache_data
def _answer_for_page(query, documents_to_search, model_name, api_key, content, block_summaries):
    """Retrieve the chunks of *content* most similar to *query* and let the LLM answer from them."""
    retriever = get_retriever(content)
    found_docs = retriever.similarity_search(query, k=documents_to_search)
    llm = get_llm(model_name=model_name, model_temperature=0, api_key=api_key)
    template = \
"""
You're an experienced copywriter. Answer the question in English. Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
Answer the question in a way so that the reader has no more questions. Be concise. Make sure you mention all the important information. You can add additional relevant information from yourself that you think may contribute to the overall understanding. Asses critically the provided context, chat history or own answer.
Chat History: ``` {chat_history} ```
Context: ``` {context} ```
Question: ``` {question} ```
Helpful Answer:
"""
    prompt = ChatPromptTemplate.from_template(template=template)

    # The block summaries stand in for the conversation so far.
    chat_history = [AIMessage(content=' '.join(block_summaries))]
    messages = prompt.format_messages(context=found_docs, question=query, chat_history=chat_history)
    response = llm(messages)
    return response.content
153
+
154
+
155
def show_audio_message(message):
    """Render *message* as text, followed by an audio player with its spoken version."""
    st.write(message)
    st.audio(generate_audio(message))
159
+
160
+
161
def show_question_input():
    """Render the free-text question box, a submit button and the example questions.

    Either path stores the chosen question in ``st.session_state.question``.
    """
    def on_question_button(question):
        # Clicking an example question selects it as the current question.
        st.session_state.question = question

    def submit_question():
        typed = st.session_state.user_question_widget
        if len(typed) == 0:
            logging.info("empty user question")
            return
        st.session_state.question = typed
        # Clear the box so the next question starts from an empty field.
        st.session_state.user_question_widget = ''

    st.text_area(
        label="Ask your question about the content of the page:",
        key='user_question_widget',
        on_change=submit_question,
    )
    st.button("Submit")

    with st.expander("Example questions:"):
        for example in st.session_state.content_block_questions:
            st.button(example, on_click=on_question_button, args=[example])
180
+
181
+
182
def main() -> None:
    """Render one pass of the Doodle page.

    The page moves through these states, driven by ``st.session_state``:
      1. no ``content`` yet   -> landing page with the link input; a valid link
                                 is scraped and the script reruns;
      2. no ``content_summary`` -> summarise the page, show the summary with
                                 audio and the question input;
      3. a pending ``question`` -> answer it, show the answer with audio;
      4. otherwise            -> just the question input.

    Any unexpected error is logged and replaced by a friendly retry prompt.
    """
    try:
        if 'web_url' in st.session_state:
            col1, col2 = st.columns(2)
            col1.caption(f"discussing: {st.session_state.web_url}")
            col2.caption(f"{st.session_state.title}")

        if 'content' not in st.session_state:
            init_session()
            st.header("Doodle")
            st.image("./assets/doodle.jpg")
            description = """\
Meet 'Doodle,' your shortcut to understanding the web! Got a lengthy article you're eyeing?
Just paste the link, and in an instant, Doodle delivers a crisp summary and intriguing questions for you to
chew on. Want to go hands-free? Doodle's text-to-speech feature will read it to you! Why the name 'Doodle'?
Just as a simple doodle can encapsulate a whole idea, we distill webpages down to their essence!
"""
            st.caption(description)
            st.divider()

            content_url = st.text_input(label='Paste your link, e.g. https://expresso.today',
                                        label_visibility='collapsed',
                                        placeholder='Paste your link:')
            col1, _, _, _, col2 = st.columns(5)
            col1.button("Doodle")
            if col2.button("Random Page"):
                content_url = 'https://mailchi.mp/expresso/lightpeak'

            if len(content_url) > 0:
                if is_valid_web_link(content_url):
                    with st.spinner(f"reading the web page '{content_url}' ..."):
                        # Scrape and extract into locals first: session state is
                        # only touched once everything succeeded, so a failed
                        # scrape cannot leave `web_url` set without `title`
                        # (which would make the header above fail on every retry).
                        web_page = scrape_the_article(content_url)
                        title = web_page['title']
                        content = web_page['content']

                        st.session_state.web_page = web_page
                        st.session_state.title = title
                        st.session_state.content = content
                        st.session_state.web_url = content_url
                    st.experimental_rerun()
                else:
                    st.warning("invalid link")
        elif 'content_summary' not in st.session_state:
            content_summary = get_content_summary(content=st.session_state.content, model_name="gpt-3.5-turbo-16k",
                                                  api_key=st.session_state.open_api_key)
            st.session_state.content_summary = content_summary['summary']
            st.session_state.content_block_summary = [s['block_summary'] for s in content_summary['blocks']]
            st.session_state.content_block_questions = [s['block_question'] for s in content_summary['blocks']]

            show_audio_message(st.session_state.content_summary)
            show_question_input()
        elif 'question' in st.session_state and st.session_state.question is not None:
            question = st.session_state.question
            st.subheader(question)
            st.divider()
            with st.spinner('answering the question...'):
                answer = qa(query=question, documents_to_search=20, model_name='gpt-4',
                            api_key=st.session_state.open_api_key)
            # NOTE: the random `raise Exception("test")` fault injection that
            # used to sit here was removed; it failed roughly half the answers.
            show_audio_message(answer)
            st.session_state.question = None
            show_question_input()
        else:
            show_question_input()
    except Exception:
        # Top-level boundary: keep the UI alive, but record the traceback so the
        # failure can actually be diagnosed.
        logging.exception("unhandled error while rendering the page")
        st.warning("Whoops, looks like a hiccup in the system! But no worries, our tech wizards are already on the case, working their magic. In the meantime, how about giving it another shot?")
        if st.button("Give It Another Go!"):
            st.experimental_rerun()


if __name__ == "__main__":
    main()
251
+
252
+ # TODO:
253
+ # - chat history
254
+ # - store history externally along with audio description and return from cache
assets/doodle.jpg ADDED
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ openai~=0.27.9
2
+ streamlit~=1.26.0
3
+ langchain~=0.0.273
4
+ crawlbase~=1.0.0
5
+ tiktoken~=0.4.0
6
+ qdrant-client~=1.4.0
7
+ elevenlabs~=0.2.24