shresthasingh committed on
Commit
3ef554c
·
verified ·
1 Parent(s): adafc1d

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -0
app.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import base64
import html
import mimetypes
import os
import tempfile
from pathlib import Path

import gradio as gr
from openai import OpenAI
6
# OpenAI API key, read from the environment variable named "openai".
# os.getenv returns None when the variable is unset, so API_KEY may be None.
API_KEY = os.getenv("openai")
7
+
8
def _image_to_data_url(image_path):
    """Return the image file at *image_path* encoded as a base64 ``data:`` URL."""
    # Declare the media type implied by the file extension instead of always
    # claiming JPEG; fall back to JPEG when the extension is unknown.
    mime_type, _ = mimetypes.guess_type(image_path)
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    with open(image_path, "rb") as image_file:
        encoded_image = base64.b64encode(image_file.read()).decode("utf-8")
    return f"data:{mime_type};base64,{encoded_image}"


def _format_for_reading(text):
    """Wrap *text* in the large, bold, widely spaced paragraph used for display."""
    # The text comes from a model reading an arbitrary image, so it is escaped
    # before being embedded in HTML to keep it from being interpreted as markup.
    return (
        "<p style='font-family: Arial, sans-serif; font-size: 18px; "
        "line-height: 1.5; font-weight: bold;'>"
        f"{html.escape(text)}</p>"
    )


def process_image(image_path):
    """Extract the text in an image, format it for reading, and synthesize speech.

    Args:
        image_path: Filesystem path of the uploaded image. May be ``None`` or
            empty when no image was provided.

    Returns:
        A ``(html_text, audio_path)`` tuple. ``html_text`` is an HTML paragraph
        containing the extracted text (or a short notice when there is nothing
        to read). ``audio_path`` is the path of an MP3 file with the spoken
        text, or ``None`` when no speech was generated.

    Raises:
        OSError: If the image file cannot be read.
        Errors raised by the OpenAI client are propagated unchanged.
    """
    # Nothing uploaded: report it instead of failing inside open().
    if not image_path:
        return _format_for_reading("Please upload an image first."), None

    client = OpenAI(api_key=API_KEY)

    # Use a vision-capable chat model to perform OCR on the image.
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "You are an OCR system. Extract all text from the image and return it without any additional commentary.",
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What text is in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {"url": _image_to_data_url(image_path)},
                    },
                ],
            },
        ],
        # NOTE(review): this caps the transcription length; text-heavy images
        # will be cut off at 300 tokens. Confirm whether that is intended.
        max_tokens=300,
    )

    # The message content can be None or empty; the speech request needs
    # non-empty input, so stop here when there is nothing to read aloud.
    extracted_text = response.choices[0].message.content or ""
    if not extracted_text.strip():
        return _format_for_reading("No text was found in the image."), None

    formatted_text = _format_for_reading(extracted_text)

    # Reserve a unique file per call: a single shared "speech.mp3" would be
    # overwritten when two requests are processed at the same time.
    # delete=False keeps the file on disk after the handle is closed so the
    # caller can read it; the files are not cleaned up here.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as speech_file:
        speech_file_path = speech_file.name

    # Stream the synthesized audio straight to disk.
    with client.audio.speech.with_streaming_response.create(
        model="tts-1",
        voice="nova",
        input=extracted_text,
    ) as speech_response:
        speech_response.stream_to_file(speech_file_path)

    return formatted_text, str(speech_file_path)
54
+
55
# Gradio interface
# The components are created first (input, then the two outputs) so the
# Interface call below reads as plain wiring: one uploaded image goes into
# process_image, which fills the HTML text panel and the audio player.
_image_upload = gr.Image(type="filepath", label="Upload Image")
_text_panel = gr.HTML(label="Extracted and Formatted Text")
_audio_player = gr.Audio(label="Text-to-Speech")

iface = gr.Interface(
    fn=process_image,
    inputs=[_image_upload],
    outputs=[_text_panel, _audio_player],
    title="Dyslexia-Friendly Reading Assistant",
    description="Upload an image of text. The app will extract the text, format it for easier reading, and provide an audio version.",
)

# Start the web app as soon as the module is executed.
iface.launch()