Spaces: Running on T4

gabriel chua committed
Commit 9c20b4e
Parent(s): 3cf1f43

add opensouce version
Files changed:
- .dockerignore (+0, -164)
- Dockerfile (+0, -11)
- description.md (+0, -5)
- docker-compose.yml (+0, -8)
- head.html (+0, -49)
- main.py (+46, -113)
- prompts.py (+43, -0)
- requirements.txt (+8, -8)
- static/icon.png (+0, -0)
- static/logo.png (+0, -0)
- utils.py (+71, -0)
.dockerignore
DELETED
@@ -1,164 +0,0 @@
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-share/python-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-
-# PyInstaller
-# Usually these files are written by a python script from a template
-# before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-*.py,cover
-.hypothesis/
-.pytest_cache/
-cover/
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-.pybuilder/
-target/
-
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# IPython
-profile_default/
-ipython_config.py
-
-# pyenv
-# For a library or package, you might want to ignore these files since the code is
-# intended to run in multiple environments; otherwise, check them in:
-# .python-version
-
-# pipenv
-# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-# However, in case of collaboration, if having platform-specific dependencies or dependencies
-# having no cross-platform support, pipenv may install dependencies that don't work, or not
-# install all needed dependencies.
-#Pipfile.lock
-
-# poetry
-# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
-# This is especially recommended for binary packages to ensure reproducibility, and is more
-# commonly ignored for libraries.
-# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
-#poetry.lock
-
-# pdm
-# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
-#pdm.lock
-# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
-# in version control.
-# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
-.pdm.toml
-.pdm-python
-.pdm-build/
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
-__pypackages__/
-
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-
-# SageMath parsed files
-*.sage.py
-
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-
-# Spyder project settings
-.spyderproject
-.spyproject
-
-# Rope project settings
-.ropeproject
-
-# mkdocs documentation
-/site
-
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-
-# Pyre type checker
-.pyre/
-
-# pytype static type analyzer
-.pytype/
-
-# Cython debug symbols
-cython_debug/
-
-# PyCharm
-# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
-# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
-# and can be added to the global gitignore or merged into this file. For a more nuclear
-# option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
-
-gradio_cached_examples/
Dockerfile
DELETED
@@ -1,11 +0,0 @@
-FROM python:3.12-slim
-
-RUN pip install uv
-RUN uv venv
-
-COPY requirements.txt .
-RUN uv pip install -r requirements.txt
-
-COPY . .
-
-CMD .venv/bin/granian --interface asgi --port 8080 --host 0.0.0.0 main:app
description.md
DELETED
@@ -1,5 +0,0 @@
-<p style="text-align:center">
-<strong>Convert any PDF into a podcast episode! Experience research papers, websites, and more in a whole new way.</strong>
-<br>
-<a href="https://github.com/knowsuchagency/pdf-to-podcast">knowsuchagency/pdf-to-podcast</a>
-</p>
docker-compose.yml
DELETED
@@ -1,8 +0,0 @@
-services:
-  web:
-    build: .
-    ports:
-      - "8080:8080"
-    environment:
-      GEMINI_API_KEY: ${GEMINI_API_KEY}
-
head.html
DELETED
@@ -1,49 +0,0 @@
-<!-- Primary Meta Tags -->
-<title>PDF to Podcast - Convert Your Documents to Audio</title>
-<meta name="title" content="PDF to Podcast - Convert Your Documents to Audio" />
-<meta
-  name="description"
-  content="Easily convert your PDF documents into audio podcasts. Perfect for listening on the go and making content more accessible."
-/>
-
-<!-- Open Graph / Facebook -->
-<meta property="og:type" content="website" />
-<meta property="og:url" content="https://pdf-to-podcast.com/" />
-<meta
-  property="og:title"
-  content="PDF to Podcast - Convert Your Documents to Audio"
-/>
-<meta
-  property="og:description"
-  content="Easily convert your PDF documents into audio podcasts. Perfect for listening on the go and making content more accessible."
-/>
-<meta
-  property="og:image"
-  content="https://pdf-to-podcast.com/static/logo.png"
-/>
-
-<!-- Twitter -->
-<meta property="twitter:card" content="summary_large_image" />
-<meta property="twitter:url" content="https://pdf-to-podcast.com/" />
-<meta
-  property="twitter:title"
-  content="PDF to Podcast - Convert Your Documents to Audio"
-/>
-<meta
-  property="twitter:description"
-  content="Easily convert your PDF documents into audio podcasts. Perfect for listening on the go and making content more accessible."
-/>
-<meta
-  property="twitter:image"
-  content="https://pdf-to-podcast.com/static/logo.png"
-/>
-
-<!-- Additional Meta Tags -->
-<meta name="viewport" content="width=device-width, initial-scale=1" />
-<meta charset="UTF-8" />
-<meta name="author" content="Stephan Fitzpatrick" />
-<meta
-  name="keywords"
-  content="PDF to Podcast, PDF to audio, document to podcast, audio conversion, podcast creation, accessible content"
-/>
-<link rel="icon" href="/static/icon.png" type="image/png" />
main.py
CHANGED
Old version:

@@ -1,25 +1,27 @@
-
 import glob
-import io
 import os
 import time
 from pathlib import Path
 from tempfile import NamedTemporaryFile
-from typing import List, Literal

 import gradio as gr
-import sentry_sdk
 from fastapi import FastAPI
 from fastapi.staticfiles import StaticFiles
 from loguru import logger
-from
-from promptic import llm
-from pydantic import BaseModel, ValidationError
 from pypdf import PdfReader
-from
-

-

 app = FastAPI()

@@ -27,158 +29,89 @@ app.mount("/static", StaticFiles(directory="static"), name="static")


 class DialogueItem(BaseModel):
-
-    speaker: Literal["female-1", "male-1", "female-2"]

-
-
-        return {
-            "female-1": "alloy",
-            "male-1": "onyx",
-            "female-2": "shimmer",
-        }[self.speaker]


 class Dialogue(BaseModel):
     scratchpad: str
     dialogue: List[DialogueItem]


-def
-
-
-    )
-
-    with client.audio.speech.with_streaming_response.create(
-        model="tts-1",
-        voice=voice,
-        input=text,
-    ) as response:
-        with io.BytesIO() as file:
-            for chunk in response.iter_bytes():
-                file.write(chunk)
-            return file.getvalue()
-
-
-def generate_audio(file: str, openai_api_key: str = None) -> bytes:
-
-    if not os.getenv("OPENAI_API_KEY", openai_api_key):
-        raise gr.Error("OpenAI API key is required")
-
     with Path(file).open("rb") as f:
         reader = PdfReader(f)
         text = "\n\n".join([page.extract_text() for page in reader.pages])

-
-
-
-    )
-    def generate_dialogue(text: str) -> Dialogue:
-        """
-        Your task is to take the input text provided and turn it into an engaging, informative podcast dialogue. The input text may be messy or unstructured, as it could come from a variety of sources like PDFs or web pages. Don't worry about the formatting issues or any irrelevant information; your goal is to extract the key points and interesting facts that could be discussed in a podcast.
-
-        Here is the input text you will be working with:
-
-        <input_text>
-        {text}
-        </input_text>
-
-        First, carefully read through the input text and identify the main topics, key points, and any interesting facts or anecdotes. Think about how you could present this information in a fun, engaging way that would be suitable for an audio podcast.

-
-
-
-        Keep in mind that your podcast should be accessible to a general audience, so avoid using too much jargon or assuming prior knowledge of the topic. If necessary, think of ways to briefly explain any complex concepts in simple terms.
-
-        Use your imagination to fill in any gaps in the input text or to come up with thought-provoking questions that could be explored in the podcast. The goal is to create an informative and entertaining dialogue, so feel free to be creative in your approach.
-
-        Write your brainstorming ideas and a rough outline for the podcast dialogue here. Be sure to note the key insights and takeaways you want to reiterate at the end.
-        </scratchpad>
-
-        Now that you have brainstormed ideas and created a rough outline, it's time to write the actual podcast dialogue. Aim for a natural, conversational flow between the host and any guest speakers. Incorporate the best ideas from your brainstorming session and make sure to explain any complex topics in an easy-to-understand way.
-
-        <podcast_dialogue>
-        Write your engaging, informative podcast dialogue here, based on the key points and creative ideas you came up with during the brainstorming session. Use a conversational tone and include any necessary context or explanations to make the content accessible to a general audience. Use made-up names for the hosts and guests to create a more engaging and immersive experience for listeners. Do not include any bracketed placeholders like [Host] or [Guest]. Design your output to be read aloud -- it will be directly converted into audio.
-
-        Make the dialogue as long and detailed as possible, while still staying on topic and maintaining an engaging flow. Aim to use your full output capacity to create the longest podcast episode you can, while still communicating the key information from the input text in an entertaining way.
-
-        At the end of the dialogue, have the host and guest speakers naturally summarize the main insights and takeaways from their discussion. This should flow organically from the conversation, reiterating the key points in a casual, conversational manner. Avoid making it sound like an obvious recap - the goal is to reinforce the central ideas one last time before signing off.
-        </podcast_dialogue>
-        """
-
-    llm_output = generate_dialogue(text)
-
-    audio = b""
     transcript = ""

-
-
-
-
-
-            transcript_line = f"{line.speaker}: {line.text}"
-            future = executor.submit(get_mp3, line.text, line.voice, openai_api_key)
-            futures.append((future, transcript_line))
-            characters += len(line.text)

-
-
-
-

-

     temporary_directory = "./gradio_cached_examples/tmp/"
     os.makedirs(temporary_directory, exist_ok=True)

-    # we use a temporary file because Gradio's audio component doesn't work with raw bytes in Safari
     temporary_file = NamedTemporaryFile(
         dir=temporary_directory,
         delete=False,
         suffix=".mp3",
     )
-    temporary_file.
-    temporary_file.close()

     # Delete any files in the temp directory that end with .mp3 and are over a day old
     for file in glob.glob(f"{temporary_directory}*.mp3"):
         if os.path.isfile(file) and time.time() - os.path.getmtime(file) > 24 * 60 * 60:
             os.remove(file)

     return temporary_file.name, transcript


 demo = gr.Interface(
-    title="
-    description=
-    fn=
-    examples=[[str(p)] for p in Path("examples").glob("*.pdf")],
     inputs=[
         gr.File(
             label="PDF",
         ),
-        gr.Textbox(
-            label="OpenAI API Key",
-            visible=not os.getenv("OPENAI_API_KEY"),
-        ),
     ],
     outputs=[
         gr.Audio(label="Audio", format="mp3"),
         gr.Textbox(label="Transcript"),
     ],
     allow_flagging="never",
-    clear_btn=None,
-    head=os.getenv("HEAD", "") + Path("head.html").read_text(),
-    cache_examples="lazy",
     api_name=False,
 )

-
-demo = demo.queue(
-    max_size=20,
-    default_concurrency_limit=20,
-)
-
 app = gr.mount_gradio_app(app, demo, path="/")

 if __name__ == "__main__":

New version:

+"""
+main.py
+"""
+
+# Standard library imports
 import glob
 import os
 import time
 from pathlib import Path
 from tempfile import NamedTemporaryFile
+from typing import List, Literal, Tuple

+# Third-party imports
 import gradio as gr
 from fastapi import FastAPI
 from fastapi.staticfiles import StaticFiles
 from loguru import logger
+from pydantic import BaseModel
 from pypdf import PdfReader
+from pydub import AudioSegment

+# Local imports
+from prompts import SYSTEM_PROMPT
+from utils import generate_script, generate_audio

 app = FastAPI()


 class DialogueItem(BaseModel):
+    """A single dialogue item."""

+    speaker: Literal["Host (Jane)", "Guest"]
+    text: str


 class Dialogue(BaseModel):
+    """The dialogue between the host and guest."""
+
     scratchpad: str
+    participants: List[str]
     dialogue: List[DialogueItem]


+def generate_podcast(file: str) -> Tuple[str, str]:
+    """Generate the audio and transcript from the PDF."""
+    # Read the PDF file and extract text
     with Path(file).open("rb") as f:
         reader = PdfReader(f)
         text = "\n\n".join([page.extract_text() for page in reader.pages])

+    # Call the LLM
+    llm_output = generate_script(SYSTEM_PROMPT, text, Dialogue)
+    logger.info(f"Generated dialogue: {llm_output}")

+    # Process the dialogue
+    audio_segments = []
     transcript = ""
+    total_characters = 0

+    for line in llm_output.dialogue:
+        logger.info(f"Generating audio for {line.speaker}: {line.text}")
+        transcript_line = f"{line.speaker}: {line.text}"
+        transcript += transcript_line + "\n\n"
+        total_characters += len(line.text)

+        # Get audio file path
+        audio_file_path = generate_audio(line.text, line.speaker)
+        # Read the audio file into an AudioSegment
+        audio_segment = AudioSegment.from_file(audio_file_path)
+        audio_segments.append(audio_segment)

+    # Concatenate all audio segments
+    combined_audio = sum(audio_segments)

+    # Export the combined audio to a temporary file
     temporary_directory = "./gradio_cached_examples/tmp/"
     os.makedirs(temporary_directory, exist_ok=True)

     temporary_file = NamedTemporaryFile(
         dir=temporary_directory,
         delete=False,
         suffix=".mp3",
     )
+    combined_audio.export(temporary_file.name, format="mp3")

     # Delete any files in the temp directory that end with .mp3 and are over a day old
     for file in glob.glob(f"{temporary_directory}*.mp3"):
         if os.path.isfile(file) and time.time() - os.path.getmtime(file) > 24 * 60 * 60:
             os.remove(file)

+    logger.info(f"Generated {total_characters} characters of audio")
+
     return temporary_file.name, transcript


 demo = gr.Interface(
+    title="OpenPodcast",
+    description="Convert your PDFs into podcasts with open-source AI models.",
+    fn=generate_podcast,
     inputs=[
         gr.File(
             label="PDF",
         ),
     ],
     outputs=[
         gr.Audio(label="Audio", format="mp3"),
         gr.Textbox(label="Transcript"),
     ],
     allow_flagging="never",
     api_name=False,
 )

 app = gr.mount_gradio_app(app, demo, path="/")

 if __name__ == "__main__":
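One detail worth noting in the new generate_podcast: the per-line clips are joined with sum(audio_segments), which relies on pydub letting the built-in sum() fold a list of AudioSegment objects (0 + segment yields the segment, segment + segment concatenates). A minimal sketch of that pattern, with placeholder file names:

from pydub import AudioSegment

# Placeholder inputs: any two short audio clips on disk.
clips = [AudioSegment.from_file("line1.mp3"), AudioSegment.from_file("line2.mp3")]

# sum() starts from 0; pydub treats 0 + segment as the segment itself,
# so the list folds into one continuous clip, matching generate_podcast.
episode = sum(clips)
episode.export("episode.mp3", format="mp3")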
prompts.py
ADDED
@@ -0,0 +1,43 @@
+"""
+prompts.py
+"""
+
+SYSTEM_PROMPT = """
+You are a world-class podcast producer.
+Your task is to transform the provided input text into an engaging and informative podcast script.
+You will receive as input a text that may be unstructured or messy, sourced from places like PDFs or web pages. Ignore irrelevant information or formatting issues.
+Your focus is on extracting the most interesting and insightful content for a podcast discussion.
+
+# Steps to Follow:
+
+1. **Analyze the Input:**
+   Carefully read the input text. Identify the key topics, points, and any interesting facts or anecdotes that could drive a compelling podcast conversation.
+
+2. **Brainstorm Ideas:**
+   In the `<scratchpad>`, brainstorm creative ways to present the key points in an engaging manner. Think of analogies, storytelling techniques, or hypothetical scenarios to make the content relatable and entertaining for listeners.
+
+   - Keep the discussion accessible to a general audience. Avoid jargon and briefly explain complex concepts in simple terms.
+   - Use imagination to fill in any gaps or create thought-provoking questions to explore during the podcast.
+   - Your aim is to create an entertaining and informative podcast, so feel free to be creative with your approach.
+
+3. **Write the Dialogue:**
+   Now, develop the podcast dialogue. Aim for a natural, conversational flow between the host (named Jane) and the guest speaker (the author of the input text, if mentioned).
+
+   - Use the best ideas from your brainstorming session.
+   - Ensure complex topics are explained clearly and simply.
+   - Focus on maintaining an engaging and lively tone that would captivate listeners.
+   - Rules:
+     > The host should go first.
+     > The host should ask the guest questions.
+     > The host should summarize the key insights at the end.
+     > Include common verbal fillers like "uhms" and "errs" in the host and guest responses. This is so the script is realistic.
+     > The host and guest can interrupt each other.
+     > The guest must NOT include marketing or self-promotional content.
+     > The guest must NOT include any material NOT substantiated within the input text.
+     > This is to be a PG conversation.
+
+4. **Wrap it Up:**
+   At the end of the dialogue, the host and guest should naturally summarize the key insights. This should feel like a casual conversation, rather than a formal recap, reinforcing the main points one last time before signing off.
+
+ALWAYS REPLY IN VALID JSON, AND NO CODE BLOCKS. BEGIN DIRECTLY WITH THE JSON OUTPUT.
+"""
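For reference, the JSON this prompt demands has to validate against the Dialogue model defined in main.py; a conforming payload would look roughly like the sketch below (the dialogue content itself is purely illustrative):

# Illustrative only: the shape main.py's Dialogue model expects back from the LLM.
example_llm_output = {
    "scratchpad": "Outline: why the paper matters, the key result, one caveat to discuss.",
    "participants": ["Host (Jane)", "Guest"],
    "dialogue": [
        {"speaker": "Host (Jane)", "text": "Welcome back! Today we're, uhm, unpacking a new paper on..."},
        {"speaker": "Guest", "text": "Thanks, Jane. Err, glad to walk through it."},
    ],
}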
requirements.txt
CHANGED
@@ -1,9 +1,9 @@
Old version:

-gradio
 promptic==0.7.5
-pydantic
-
-
-
-tenacity~=8.3
-sentry-sdk~=2.5
-granian~=1.4

New version:

+gradio==4.44.0
+granian==1.4
+loguru==0.7
+openai==1.50.2
 promptic==0.7.5
+pydantic==2.7
+pypdf==4.1
+sentry-sdk==2.5
+tenacity==8.3
static/icon.png
DELETED
Binary file (1.34 kB)
static/logo.png
DELETED
Binary file (134 kB)
utils.py
ADDED
@@ -0,0 +1,71 @@
+"""
+utils.py
+
+Functions:
+- get_script: Get the dialogue from the LLM.
+- call_llm: Call the LLM with the given prompt and dialogue format.
+- get_audio: Get the audio from the TTS model from HF Spaces.
+"""
+
+import os
+
+from gradio_client import Client
+from openai import OpenAI
+from pydantic import ValidationError
+
+client = OpenAI(
+    base_url="https://api.fireworks.ai/inference/v1",
+    api_key=os.getenv("FIREWORKS_API_KEY"),
+)
+
+hf_client = Client("mrfakename/MeloTTS")
+
+
+def generate_script(system_prompt: str, text: str, dialogue_format):
+    """Get the dialogue from the LLM."""
+    # Load as python object
+    try:
+        response = call_llm(system_prompt, text, dialogue_format)
+        dialogue = dialogue_format.model_validate_json(
+            response.choices[0].message.content
+        )
+    except ValidationError as e:
+        error_message = f"Failed to parse dialogue JSON: {e}"
+        system_prompt_with_error = f"{system_prompt}\n\n Please return a VALID JSON object. This was the earlier error: {error_message}"
+        response = call_llm(system_prompt_with_error, text, dialogue_format)
+        dialogue = dialogue_format.model_validate_json(
+            response.choices[0].message.content
+        )
+    return dialogue
+
+
+def call_llm(system_prompt: str, text: str, dialogue_format):
+    """Call the LLM with the given prompt and dialogue format."""
+    response = client.chat.completions.create(
+        messages=[
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": text},
+        ],
+        model="accounts/fireworks/models/llama-v3p1-405b-instruct",
+        max_tokens=16_384,
+        temperature=0.1,
+        response_format={
+            "type": "json_object",
+            "schema": dialogue_format.model_json_schema(),
+        },
+    )
+    return response
+
+
+def generate_audio(text: str, speaker: str) -> str:
+    """Get the audio from the TTS model from HF Spaces."""
+    if speaker == "Guest":
+        accent = "EN-US"
+        speed = 0.9
+    else:  # host
+        accent = "EN-Default"
+        speed = 1
+    result = hf_client.predict(
+        text=text, language="EN", speaker=accent, speed=speed, api_name="/synthesize"
+    )
+    return result
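A minimal usage sketch of the new helpers, assuming FIREWORKS_API_KEY is set and the mrfakename/MeloTTS Space is reachable; the Dialogue/DialogueItem models are copied here exactly as main.py declares them, and the sample text is a placeholder:

from typing import List, Literal

from pydantic import BaseModel

from prompts import SYSTEM_PROMPT
from utils import generate_audio, generate_script


class DialogueItem(BaseModel):
    speaker: Literal["Host (Jane)", "Guest"]
    text: str


class Dialogue(BaseModel):
    scratchpad: str
    participants: List[str]
    dialogue: List[DialogueItem]


# Placeholder input; in the app this is the text extracted from the uploaded PDF.
pdf_text = "Large language models can be distilled into smaller task-specific models..."

script = generate_script(SYSTEM_PROMPT, pdf_text, Dialogue)
first_line = script.dialogue[0]

# generate_audio returns whatever the MeloTTS Space hands back,
# i.e. a path to the synthesized clip for this line.
audio_path = generate_audio(first_line.text, first_line.speaker)
print(first_line.speaker, "->", audio_path)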