gabrielchua committed
Commit dc06293 • 1 Parent(s): 1573ac4

add jina, language support

Files changed (2)
  1. app.py +72 -27
  2. utils.py +18 -5
app.py CHANGED
@@ -13,13 +13,15 @@ from typing import List, Literal, Tuple, Optional
 # Third-party imports
 import gradio as gr
 from loguru import logger
+from openai import OpenAI
 from pydantic import BaseModel
 from pypdf import PdfReader
 from pydub import AudioSegment
 
 # Local imports
 from prompts import SYSTEM_PROMPT
-from utils import generate_script, generate_audio
+from utils import generate_script, generate_audio, parse_url
+
 
 class DialogueItem(BaseModel):
     """A single dialogue item."""
@@ -36,24 +38,55 @@ class Dialogue(BaseModel):
     dialogue: List[DialogueItem]
 
 
-def generate_podcast(file: str, tone: Optional[str] = None, length: Optional[str] = None) -> Tuple[str, str]:
-    """Generate the audio and transcript from the PDF."""
-    # Check if the file is a PDF
-    if not file.lower().endswith('.pdf'):
-        raise gr.Error("Please upload a PDF file.")
-
-    # Read the PDF file and extract text
-    try:
-        with Path(file).open("rb") as f:
-            reader = PdfReader(f)
-            text = "\n\n".join([page.extract_text() for page in reader.pages])
-    except Exception as e:
-        raise gr.Error(f"Error reading the PDF file: {str(e)}")
-
-    # Check if the PDF has more than ~150,000 characters
+def generate_podcast(
+    files: List[str],
+    url: Optional[str],
+    tone: Optional[str],
+    length: Optional[str],
+    language: str
+) -> Tuple[str, str]:
+    """Generate the audio and transcript from the PDFs and/or URL."""
+    text = ""
+
+    # Change language to the appropriate code
+    language_mapping = {
+        "English": "EN",
+        "Spanish": "ES",
+        "French": "FR",
+        "Chinese": "ZH",
+        "Japanese": "JP",
+        "Korean": "KR",
+    }
+
+    # Check if at least one input is provided
+    if not files and not url:
+        raise gr.Error("Please provide at least one PDF file or a URL.")
+
+    # Process PDFs if any
+    if files:
+        for file in files:
+            if not file.lower().endswith('.pdf'):
+                raise gr.Error(f"File {file} is not a PDF. Please upload only PDF files.")
+
+            try:
+                with Path(file).open("rb") as f:
+                    reader = PdfReader(f)
+                    text += "\n\n".join([page.extract_text() for page in reader.pages])
+            except Exception as e:
+                raise gr.Error(f"Error reading the PDF file {file}: {str(e)}")
+
+    # Process URL if provided
+    if url:
+        try:
+            url_text = parse_url(url)
+            text += "\n\n" + url_text
+        except ValueError as e:
+            raise gr.Error(str(e))
+
+    # Check total character count
     if len(text) > 100000:
-        raise gr.Error("The PDF is too long. Please upload a PDF with fewer than ~100,000 characters.")
-
+        raise gr.Error("The total content is too long. Please ensure the combined text from PDFs and URL is fewer than ~100,000 characters.")
+
     # Modify the system prompt based on the chosen tone and length
     modified_system_prompt = SYSTEM_PROMPT
     if tone:
@@ -64,6 +97,8 @@ def generate_podcast(file: str, tone: Optional[str] = None, length: Optional[str
             "Medium (3-5 min)": "Aim for a moderate length, about 3-5 minutes.",
         }
         modified_system_prompt += f"\n\nLENGTH: {length_instructions[length]}"
+    if language:
+        modified_system_prompt += f"\n\nOUTPUT LANGUAGE <IMPORTANT>: The podcast should be in {language}."
 
     # Call the LLM
     llm_output = generate_script(modified_system_prompt, text, Dialogue)
@@ -71,7 +106,7 @@ def generate_podcast(file: str, tone: Optional[str] = None, length: Optional[str
 
     # Process the dialogue
     audio_segments = []
-    transcript = ""  # start with an empty transcript
+    transcript = ""
    total_characters = 0
 
     for line in llm_output.dialogue:
@@ -84,7 +119,7 @@ def generate_podcast(file: str, tone: Optional[str] = None, length: Optional[str
         total_characters += len(line.text)
 
         # Get audio file path
-        audio_file_path = generate_audio(line.text, line.speaker)
+        audio_file_path = generate_audio(line.text, line.speaker, language_mapping[language])
         # Read the audio file into an AudioSegment
         audio_segment = AudioSegment.from_file(audio_file_path)
         audio_segments.append(audio_segment)
@@ -115,30 +150,40 @@ def generate_podcast(file: str, tone: Optional[str] = None, length: Optional[str
 
 demo = gr.Interface(
     title="Open NotebookLM",
-    description="Convert your PDFs into podcasts with open-source AI models (Llama 3.1 405B and MeloTTS). \n \n Note: Only the text content of the PDF will be processed. Images and tables are not included. The PDF should be no more than 100,000 characters due to the context length of Llama 3.1 405B.",
+    description="Convert your PDFs into podcasts with open-source AI models (Llama 3.1 405B and MeloTTS). \n \n Note: Only the text content of the PDFs will be processed. Images and tables are not included. The total content should be no more than 100,000 characters due to the context length of Llama 3.1 405B.",
     fn=generate_podcast,
     inputs=[
         gr.File(
-            label="PDF",
-            file_types=[".pdf", "file/*"],
+            label="1. 📄 Upload your PDF(s)",
+            file_types=[".pdf"],
+            file_count="multiple"
+        ),
+        gr.Textbox(
+            label="2. 🔗 Paste a URL (optional)",
+            placeholder="Enter a URL to include its content"
         ),
         gr.Radio(
             choices=["Fun", "Formal"],
-            label="Tone of the podcast",
-            value="casual"
+            label="3. 🎭 Choose the tone",
+            value="Fun"
         ),
         gr.Radio(
             choices=["Short (1-2 min)", "Medium (3-5 min)"],
-            label="Length of the podcast",
+            label="4. ⏱️ Choose the length",
             value="Medium (3-5 min)"
         ),
+        gr.Dropdown(
+            choices=["English", "Spanish", "French", "Chinese", "Japanese", "Korean"],
+            value="English",
+            label="5. 🌐 Choose the language (Highly experimental, English is recommended)",
+        ),
     ],
     outputs=[
         gr.Audio(label="Audio", format="mp3"),
         gr.Markdown(label="Transcript"),
     ],
     allow_flagging="never",
-    api_name="generate_podcast",  # Add this line
+    api_name="generate_podcast",
     theme=gr.themes.Soft(),
     concurrency_limit=3
 )
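
Below is a minimal sketch (not part of the commit) of how the reworked generate_podcast signature might be exercised locally, assuming app.py is importable and the Fireworks API key and MeloTTS Space are configured; the PDF path and URL are placeholders:

# Hypothetical local smoke test for the new signature (placeholder inputs).
from app import generate_podcast

audio_path, transcript = generate_podcast(
    files=["paper.pdf"],          # one or more local PDF paths
    url="https://example.com",    # optional URL, fetched through Jina Reader
    tone="Fun",                   # "Fun" or "Formal"
    length="Short (1-2 min)",     # or "Medium (3-5 min)"
    language="English",           # must be a key of language_mapping
)
print(transcript)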
utils.py CHANGED
@@ -8,12 +8,14 @@ Functions:
 """
 
 import os
+import requests
 
 from gradio_client import Client
 from openai import OpenAI
 from pydantic import ValidationError
 
 MODEL_ID = "accounts/fireworks/models/llama-v3p1-405b-instruct"
+JINA_URL = "https://r.jina.ai/"
 
 client = OpenAI(
     base_url="https://api.fireworks.ai/inference/v1",
@@ -59,15 +61,26 @@ def call_llm(system_prompt: str, text: str, dialogue_format):
     return response
 
 
-def generate_audio(text: str, speaker: str) -> str:
-    """Get the audio from the TTS model from HF Spaces."""
+def parse_url(url: str) -> str:
+    """Parse the given URL and return the text content."""
+    full_url = f"{JINA_URL}{url}"
+    response = requests.get(full_url, timeout=60)
+    return response.text
+
+
+def generate_audio(text: str, speaker: str, language: str) -> str:
+    """Get the audio from the TTS model from HF Spaces and adjust the speed if necessary."""
     if speaker == "Guest":
-        accent = "EN-US"
+        accent = "EN-US" if language == "EN" else language
         speed = 0.9
     else:  # host
-        accent = "EN-Default"
+        accent = "EN-Default" if language == "EN" else language
         speed = 1
+    if language != "EN" and speaker != "Guest":
+        speed = 1.1
+
+    # Generate audio
     result = hf_client.predict(
-        text=text, language="EN", speaker=accent, speed=speed, api_name="/synthesize"
+        text=text, language=language, speaker=accent, speed=speed, api_name="/synthesize"
     )
     return result
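
For context on the Jina piece: parse_url just prefixes the target URL with the Jina Reader endpoint (JINA_URL) and returns the response body as plain text. A standalone sketch of the same idea, assuming requests is installed; the helper name fetch_page_text is hypothetical:

# Fetch a page as plain text through the Jina Reader proxy (same approach as parse_url).
import requests

JINA_URL = "https://r.jina.ai/"

def fetch_page_text(url: str) -> str:
    response = requests.get(f"{JINA_URL}{url}", timeout=60)
    response.raise_for_status()  # unlike parse_url above, surface HTTP errors explicitly
    return response.text

print(fetch_page_text("https://example.com")[:200])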