docs_to_synthetic_qa

Sleeping

App Files Files Community

mvansegbroeck committed on Jun 7

Commit

4af6426

•

1 Parent(s): a444f06

Update app.py

Browse files

Files changed (1) hide show

app.py +114 -71

app.py CHANGED Viewed

@@ -9,8 +9,8 @@ import random
 from gretel_client import Gretel
 from gretel_client.config import GretelClientConfigurationError
-# Directory for saving processed PDFs
-output_dir = 'processed_pdfs'
 os.makedirs(output_dir, exist_ok=True)
 # Function to download and convert a PDF to text
@@ -22,6 +22,16 @@ def pdf_to_text(pdf_path):
         text += page.get_text()
     return text
 # Function to split text into chunks
 def split_text_into_chunks(text, chunk_size=25, chunk_overlap=5, min_chunk_chars=50):
     text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
@@ -38,76 +48,82 @@ def save_chunks(file_id, chunks, output_dir):
 # Function to read chunks from files
 def read_chunks_from_files(output_dir):
-    pdf_chunks_dict = {}
     for filename in os.listdir(output_dir):
         if filename.endswith('.md') and 'chunk' in filename:
             file_id = filename.split('_chunk_')[0]
             chunk_path = os.path.join(output_dir, filename)
             with open(chunk_path, 'r') as file:
                 chunk = file.read()
-            if file_id not in pdf_chunks_dict:
-                pdf_chunks_dict[file_id] = []
-            pdf_chunks_dict[file_id].append(chunk)
-    return pdf_chunks_dict
-def process_pdfs(uploaded_files, use_example, chunk_size, chunk_overlap, min_chunk_chars, current_chunk, direction):
-    selected_pdfs = []
     if use_example:
         example_file_url = "https://gretel-datasets.s3.us-west-2.amazonaws.com/rag/GDPR_2016.pdf"
-        pdf_path = os.path.join(output_dir, example_file_url.split('/')[-1])
-        if not os.path.exists(pdf_path):
             response = requests.get(example_file_url)
-            with open(pdf_path, 'wb') as file:
                 file.write(response.content)
-        selected_pdfs = [pdf_path]
     elif uploaded_files is not None:
         for uploaded_file in uploaded_files:
-            pdf_path = os.path.join(output_dir, uploaded_file.name)
-            selected_pdfs.append(pdf_path)
     else:
-        chunk_text = "No PDFs processed"
         return None, 0, chunk_text, None
-    pdf_chunks_dict = {}
-    for pdf_path in selected_pdfs:
-        text = pdf_to_text(pdf_path)
         markdown_text = markdownify.markdownify(text)
-        file_id = os.path.splitext(os.path.basename(pdf_path))[0]
         markdown_path = os.path.join(output_dir, f"{file_id}.md")
         with open(markdown_path, 'w') as file:
             file.write(markdown_text)
         chunks = split_text_into_chunks(markdown_text, chunk_size=chunk_size, chunk_overlap=chunk_overlap, min_chunk_chars=min_chunk_chars)
         save_chunks(file_id, chunks, output_dir)
-        pdf_chunks_dict[file_id] = chunks
-    file_id = os.path.splitext(os.path.basename(selected_pdfs[0]))[0]
-    chunks = pdf_chunks_dict.get(file_id, [])
     current_chunk += direction
     if current_chunk < 0:
         current_chunk = 0
-    elif current_chunk >= len(chunks):
-        current_chunk = len(chunks) - 1
-    chunk_text = chunks[current_chunk] if chunks else "No chunks available."
-    return pdf_chunks_dict, selected_pdfs, chunk_text, current_chunk#, use_example_update
-def show_chunks(pdf_chunks_dict, selected_pdfs, current_chunk, direction):
-    if selected_pdfs:
-        file_id = os.path.splitext(os.path.basename(selected_pdfs[0]))[0]
-        chunks = pdf_chunks_dict.get(file_id, [])
-        current_chunk += direction
-        if current_chunk < 0:
-            current_chunk = 0
-        elif current_chunk >= len(chunks):
-            current_chunk = len(chunks) - 1
-        chunk_text = chunks[current_chunk] if chunks else "No chunks available."
-        return chunk_text, current_chunk
-    else:
-        return "No PDF processed.", 0
 # Validate API key and return button state
 def check_api_key(api_key):
@@ -120,7 +136,7 @@ def check_api_key(api_key):
         status_message = "Invalid"
     return gr.update(interactive=is_valid), status_message
-def generate_synthetic_records(api_key, pdf_chunks_dict, num_records):
     gretel = Gretel(api_key=api_key, validate=True, clear=True)
@@ -146,10 +162,30 @@ def generate_synthetic_records(api_key, pdf_chunks_dict, num_records):
         "top_k": 40
     }
     df_in = pd.DataFrame()
     try:
-        documents = list(pdf_chunks_dict.keys())
-        all_chunks = [(doc, chunk) for doc in documents for chunk in pdf_chunks_dict[doc]]
         for _ in range(num_records):
             doc, chunk = random.choice(all_chunks)
@@ -158,7 +194,13 @@ def generate_synthetic_records(api_key, pdf_chunks_dict, num_records):
         df = navigator.edit(PROMPT, seed_data=df_in, **GENERATE_PARAMS)
         df = df.drop(columns=['text'])
         csv_file = os.path.join(output_dir, "synthetic_qa.csv")
         df.to_csv(csv_file, index=False)
@@ -173,7 +215,7 @@ def download_dataframe(df):
     return csv_file
 # CSS styling to center the logo and prevent right-click download
-css = """
 <style>
 #logo-container {
     display: flex;
@@ -188,7 +230,7 @@ css = """
 # HTML content to include the logo
 html_content = f"""
-{css}
 <div id="logo-container">
     <svg width="181" height="72" viewBox="0 0 181 72" fill="none" xmlns="http://www.w3.org/2000/svg">
     <g clip-path="url(#clip0_849_78)">
@@ -210,37 +252,40 @@ html_content = f"""
 </div>
 """
 # Gradio interface
-with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column(scale=3):
-            # gr.Markdown("# Upload PDFs")
-            # gr.Image("gretel_logo.svg", elem_id="logo", show_label=False)
             gr.HTML(html_content)
-            with gr.Tab("Upload PDF"):
                 use_example = gr.Checkbox(label="Continue with Example PDF", value=False, interactive=True)
-                uploaded_files = gr.File(label="Upload your PDF files", file_count="multiple")
-                # if uploaded_files:
-                    # use_example = gr.Checkbox(label="Continue with Example PDF", value=False, interactive=False)
             chunk_size = gr.Slider(label="Chunk Size (tokens)", minimum=10, maximum=1500, step=10, value=500)
             chunk_overlap = gr.Slider(label="Chunk Overlap (tokens)", minimum=0, maximum=500, step=5, value=100)
             min_chunk_chars = gr.Slider(label="Minimum Chunk Characters", minimum=10, maximum=2500, step=10, value=750)
-            process_button = gr.Button("Process PDFs")
-            pdf_chunks_dict = gr.State()
-            selected_pdfs = gr.State()
             current_chunk = gr.State(value=0)
             chunk_text = gr.Textbox(label="Chunk Text", lines=10)
             def toggle_use_example(file_list):
                 return gr.update(
-                    value = False,
                     interactive=file_list is None or len(file_list) == 0
-                    )
             uploaded_files.change(
                 toggle_use_example,
@@ -249,9 +294,9 @@ with gr.Blocks() as demo:
             )
             process_button.click(
-                process_pdfs,
                 inputs=[uploaded_files, use_example, chunk_size, chunk_overlap, min_chunk_chars, current_chunk, gr.State(0)],
-                outputs=[pdf_chunks_dict, selected_pdfs, chunk_text, current_chunk]
             )
             with gr.Row():
@@ -260,13 +305,13 @@ with gr.Blocks() as demo:
             prev_button.click(
                 show_chunks,
-                inputs=[pdf_chunks_dict, selected_pdfs, current_chunk, gr.State(-1)],
                 outputs=[chunk_text, current_chunk]
             )
             next_button.click(
                 show_chunks,
-                inputs=[pdf_chunks_dict, selected_pdfs, current_chunk, gr.State(1)],
                 outputs=[chunk_text, current_chunk]
             )
@@ -277,28 +322,26 @@ with gr.Blocks() as demo:
                 api_key_input = gr.Textbox(label="Gretel API Key (available at https://console.gretel.ai)", type="password", placeholder="Enter your API key", scale=2)
                 validate_status = gr.Textbox(label="Validation Status", interactive=False, scale=1)
-            # User-specific settings
             num_records = gr.Number(label="Number of Records", value=10)
             generate_button = gr.Button("Generate Synthetic Records", interactive=False)
             download_link = gr.File(label="Download Link", visible=False)
-            # Validate API key on input change and update button interactivity
             api_key_input.change(
                 fn=check_api_key,
                 inputs=[api_key_input],
                 outputs=[generate_button, validate_status]
             )
-            output_df = gr.Dataframe(headers=["document", "topic", "user_profile", "question", "answer", "context"], wrap=True, visible=True)
-            def generate_and_prepare_download(api_key, pdf_chunks_dict, num_records):
-                df, csv_file = generate_synthetic_records(api_key, pdf_chunks_dict, num_records)
                 return df, gr.update(value=csv_file, visible=df['value']!=None)
             generate_button.click(
                 fn=generate_and_prepare_download,
-                inputs=[api_key_input, pdf_chunks_dict, num_records],
                 outputs=[output_df, download_link]
             )

 from gretel_client import Gretel
 from gretel_client.config import GretelClientConfigurationError
+# Directory for saving processed files
+output_dir = 'processed_files'
 os.makedirs(output_dir, exist_ok=True)
 # Function to download and convert a PDF to text
         text += page.get_text()
     return text
+# Function to read a TXT file
def txt_to_text(txt_path):
    """Return the full contents of the plain-text file at ``txt_path``.

    The file is decoded as UTF-8 explicitly instead of with the platform's
    locale encoding, so the same upload reads identically on every host.
    Bytes that are not valid UTF-8 are replaced with U+FFFD rather than
    aborting the whole read.

    Args:
        txt_path: Path of the text file to read.

    Returns:
        The decoded file contents as a single string.
    """
    with open(txt_path, 'r', encoding='utf-8', errors='replace') as file:
        return file.read()
+# Function to read a Markdown file
def markdown_to_text(md_path):
    """Return the raw contents of the Markdown file at ``md_path``.

    No Markdown parsing happens here; the file is returned verbatim. It is
    decoded as UTF-8 explicitly instead of with the platform's locale
    encoding, and bytes that are not valid UTF-8 are replaced with U+FFFD
    rather than aborting the read.

    Args:
        md_path: Path of the Markdown file to read.

    Returns:
        The decoded file contents as a single string.
    """
    with open(md_path, 'r', encoding='utf-8', errors='replace') as file:
        return file.read()
 # Function to split text into chunks
 def split_text_into_chunks(text, chunk_size=25, chunk_overlap=5, min_chunk_chars=50):
     text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
 # Function to read chunks from files
 def read_chunks_from_files(output_dir):
+    chunks_dict = {}
     for filename in os.listdir(output_dir):
         if filename.endswith('.md') and 'chunk' in filename:
             file_id = filename.split('_chunk_')[0]
             chunk_path = os.path.join(output_dir, filename)
             with open(chunk_path, 'r') as file:
                 chunk = file.read()
+            if file_id not in chunks_dict:
+                chunks_dict[file_id] = []
+            chunks_dict[file_id].append(chunk)
+    return chunks_dict
def process_files(uploaded_files, use_example, chunk_size, chunk_overlap, min_chunk_chars, current_chunk, direction):
    """Convert the selected documents to Markdown, chunk them, and pick a chunk to show.

    When ``use_example`` is truthy the example PDF is downloaded into
    ``output_dir`` (once) and processed; otherwise the uploaded files are
    processed. Each document is converted to text according to its extension
    (``.pdf``, ``.txt``, ``.md``; anything else yields empty text), passed
    through ``markdownify``, written to ``output_dir`` as ``<file_id>.md``,
    split with ``split_text_into_chunks`` and stored with ``save_chunks``.

    Args:
        uploaded_files: Iterable of uploaded file objects exposing a ``.name``
            path, or None / empty when nothing was uploaded.
        use_example: Process the example PDF instead of the uploads.
        chunk_size: Forwarded to ``split_text_into_chunks``.
        chunk_overlap: Forwarded to ``split_text_into_chunks``.
        min_chunk_chars: Forwarded to ``split_text_into_chunks``.
        current_chunk: Index of the chunk currently displayed (None counts as 0).
        direction: Offset added to ``current_chunk`` before clamping.

    Returns:
        Tuple ``(chunks_dict, selected_files, chunk_text, current_chunk)``:
        chunks keyed by ``<file_id><extension>``, the list of processed paths,
        the text of the selected chunk (or a status message), and the clamped
        chunk index. With nothing to process this is
        ``({}, [], "No files processed", 0)``.

    Raises:
        requests.HTTPError: If the example PDF cannot be downloaded.
    """
    selected_files = []
    if use_example:
        example_file_url = "https://gretel-datasets.s3.us-west-2.amazonaws.com/rag/GDPR_2016.pdf"
        file_path = os.path.join(output_dir, example_file_url.split('/')[-1])
        if not os.path.exists(file_path):
            response = requests.get(example_file_url, timeout=60)
            # Fail now instead of caching an error page as the "PDF", which
            # would then break every later run that finds the file on disk.
            response.raise_for_status()
            with open(file_path, 'wb') as file:
                file.write(response.content)
        selected_files = [file_path]
    elif uploaded_files:
        for uploaded_file in uploaded_files:
            # NOTE(review): this relies on uploaded_file.name being an absolute
            # path to the uploaded temp file, in which case os.path.join
            # returns it unchanged. A relative name would point at a file in
            # output_dir that is never written - confirm against the caller.
            file_path = os.path.join(output_dir, uploaded_file.name)
            selected_files.append(file_path)
    else:
        # Same order as the normal return so the caller's state stays usable.
        return {}, [], "No files processed", 0

    chunks_dict = {}
    for file_path in selected_files:
        file_extension = os.path.splitext(file_path)[1].lower()
        if file_extension == '.pdf':
            text = pdf_to_text(file_path)
        elif file_extension == '.txt':
            text = txt_to_text(file_path)
        elif file_extension == '.md':
            text = markdown_to_text(file_path)
        else:
            text = ""

        # NOTE(review): plain text and Markdown inputs also go through
        # markdownify here; confirm this does not alter existing Markdown.
        markdown_text = markdownify.markdownify(text)

        file_id = os.path.splitext(os.path.basename(file_path))[0]
        markdown_path = os.path.join(output_dir, f"{file_id}.md")
        with open(markdown_path, 'w', encoding='utf-8') as file:
            file.write(markdown_text)

        chunks = split_text_into_chunks(markdown_text, chunk_size=chunk_size, chunk_overlap=chunk_overlap, min_chunk_chars=min_chunk_chars)
        save_chunks(file_id, chunks, output_dir)
        # Keep the extension in the key so "a.pdf" and "a.txt" stay distinct.
        chunks_dict[file_id + file_extension] = chunks

    all_chunks = [chunk for chunks in chunks_dict.values() for chunk in chunks]
    if all_chunks:
        # Clamp into [0, len - 1] after applying the requested step.
        current_chunk = min(max((current_chunk or 0) + direction, 0), len(all_chunks) - 1)
        chunk_text = all_chunks[current_chunk]
    else:
        current_chunk = 0
        chunk_text = "No chunks available."

    return chunks_dict, selected_files, chunk_text, current_chunk
def show_chunks(chunks_dict, selected_files, current_chunk, direction):
    """Move the chunk cursor by ``direction`` and return the chunk at the new position.

    Chunks of all documents are viewed as one flat list, in the iteration
    order of ``chunks_dict``. The new index is clamped to the valid range, so
    stepping past either end stays on the first / last chunk.

    Args:
        chunks_dict: Mapping of document key -> list of chunk texts, or None
            when nothing has been processed yet.
        selected_files: Unused; kept so the signature matches the event wiring.
        current_chunk: Index of the chunk currently displayed (None counts as 0).
        direction: Step to apply, e.g. -1 for previous or 1 for next.

    Returns:
        Tuple ``(chunk_text, current_chunk)``. When there are no chunks this
        is ``("No chunks available.", 0)``.
    """
    all_chunks = [chunk for chunks in (chunks_dict or {}).values() for chunk in chunks]
    if not all_chunks:
        # Never hand back a negative index when there is nothing to show.
        return "No chunks available.", 0

    current_chunk = min(max((current_chunk or 0) + direction, 0), len(all_chunks) - 1)
    return all_chunks[current_chunk], current_chunk
 # Validate API key and return button state
 def check_api_key(api_key):
         status_message = "Invalid"
     return gr.update(interactive=is_valid), status_message
+def generate_synthetic_records(api_key, chunks_dict, num_records):
     gretel = Gretel(api_key=api_key, validate=True, clear=True)
         "top_k": 40
     }
+    EVAL_PROMPT = """
+    Add the following columns to the provided table based on the generated text in the `answer` column:
+    * conformance: A rating from 'Poor' to 'Excellent' indicating the conformance of the generated text to the requested format, tags, and descriptions provided.
+    - 'Poor', 'Fair', 'Good', 'Very Good', 'Excellent'
+    * quality: A rating from 'Poor' to 'Excellent' based on the grammatical correctness, coherence, and relevance of the generated text.
+    - 'Poor', 'Fair', 'Good', 'Very Good', 'Excellent'
+    * toxicity: A rating from 'Non-Toxic' to 'Highly Toxic' indicating the level of toxic content in the generated text.
+    - 'Non-Toxic', 'Moderately Toxic', 'Highly Toxic'
+    * bias: A rating from 'Unbiased' to 'Heavily Biased' indicating the level of unintended biases in the generated text.
+    - 'Unbiased', 'Moderately Biased', 'Heavily Biased'
+    * groundedness: A rating from 'Ungrounded' to 'Fully Grounded' indicating the level of factual correctness in the generated text.
+    - 'Ungrounded', 'Moderately Grounded', 'Fully Grounded'
+    """
+    EVAL_GENERATE_PARAMS = {
+        "temperature": 0.2,
+        "top_p": 0.5,
+        "top_k": 40
+    }
     df_in = pd.DataFrame()
     try:
+        documents = list(chunks_dict.keys())
+        all_chunks = [(doc, chunk) for doc in documents for chunk in chunks_dict[doc]]
         for _ in range(num_records):
             doc, chunk = random.choice(all_chunks)
         df = navigator.edit(PROMPT, seed_data=df_in, **GENERATE_PARAMS)
         df = df.drop(columns=['text'])
+        df = navigator.edit(EVAL_PROMPT, seed_data=df, **EVAL_GENERATE_PARAMS)
+        df.rename(columns={
+            "question": "synthetic_question",
+            "answer": "synthetic_answer",
+            "context": "original_context"
+        }, inplace=True)
         csv_file = os.path.join(output_dir, "synthetic_qa.csv")
         df.to_csv(csv_file, index=False)
     return csv_file
 # CSS styling to center the logo and prevent right-click download
+logo_css = """
 <style>
 #logo-container {
     display: flex;
 # HTML content to include the logo
 html_content = f"""
+{logo_css}
 <div id="logo-container">
     <svg width="181" height="72" viewBox="0 0 181 72" fill="none" xmlns="http://www.w3.org/2000/svg">
     <g clip-path="url(#clip0_849_78)">
 </div>
 """
+# Define custom CSS to set the font size
+css = """
+#small span{
+ font-size: 0.8em;
+}
+"""
 # Gradio interface
+with gr.Blocks(css=css) as demo:
     with gr.Row():
         with gr.Column(scale=3):
             gr.HTML(html_content)
+            with gr.Tab("Upload Files"):
                 use_example = gr.Checkbox(label="Continue with Example PDF", value=False, interactive=True)
+                uploaded_files = gr.File(label="Upload your files (TXT, Markdown, or PDF)", file_count="multiple", file_types=[".pdf", ".txt", ".md"])
             chunk_size = gr.Slider(label="Chunk Size (tokens)", minimum=10, maximum=1500, step=10, value=500)
             chunk_overlap = gr.Slider(label="Chunk Overlap (tokens)", minimum=0, maximum=500, step=5, value=100)
             min_chunk_chars = gr.Slider(label="Minimum Chunk Characters", minimum=10, maximum=2500, step=10, value=750)
+            process_button = gr.Button("Process Files")
+            chunks_dict = gr.State()
+            selected_files = gr.State()
             current_chunk = gr.State(value=0)
             chunk_text = gr.Textbox(label="Chunk Text", lines=10)
             def toggle_use_example(file_list):
                 """Build the component update fired when the uploaded-file list changes.

                 The update always sets the value to False and makes the target
                 interactive only while no files are uploaded (``file_list`` is
                 None or empty).
                 """
                 nothing_uploaded = file_list is None or len(file_list) == 0
                 return gr.update(value=False, interactive=nothing_uploaded)
             uploaded_files.change(
                 toggle_use_example,
             )
             process_button.click(
+                process_files,
                 inputs=[uploaded_files, use_example, chunk_size, chunk_overlap, min_chunk_chars, current_chunk, gr.State(0)],
+                outputs=[chunks_dict, selected_files, chunk_text, current_chunk]
             )
             with gr.Row():
             prev_button.click(
                 show_chunks,
+                inputs=[chunks_dict, selected_files, current_chunk, gr.State(-1)],
                 outputs=[chunk_text, current_chunk]
             )
             next_button.click(
                 show_chunks,
+                inputs=[chunks_dict, selected_files, current_chunk, gr.State(1)],
                 outputs=[chunk_text, current_chunk]
             )
                 api_key_input = gr.Textbox(label="Gretel API Key (available at https://console.gretel.ai)", type="password", placeholder="Enter your API key", scale=2)
                 validate_status = gr.Textbox(label="Validation Status", interactive=False, scale=1)
             num_records = gr.Number(label="Number of Records", value=10)
             generate_button = gr.Button("Generate Synthetic Records", interactive=False)
             download_link = gr.File(label="Download Link", visible=False)
             api_key_input.change(
                 fn=check_api_key,
                 inputs=[api_key_input],
                 outputs=[generate_button, validate_status]
             )
+            output_df = gr.Dataframe(headers=["",], wrap=True, visible=True, elem_id="small")
+            def generate_and_prepare_download(api_key, chunks_dict, num_records):
+                df, csv_file = generate_synthetic_records(api_key, chunks_dict, num_records)
                 return df, gr.update(value=csv_file, visible=df['value']!=None)
             generate_button.click(
                 fn=generate_and_prepare_download,
+                inputs=[api_key_input, chunks_dict, num_records],
                 outputs=[output_df, download_link]
             )