add popup html + report humanizer section
Files changed:
- ai_generate.py: +29 −34
- app.py: +238 −34
ai_generate.py
CHANGED
@@ -85,9 +85,8 @@ Return a citation for every quote across all articles that justify the text. Rem
 The entire text should be wrapped in one cited_text. For References section (if asked by prompt), don't add citations.
 For source id, give a valid integer alone without a key.
 Here are the sources:{context}"""
-xml_prompt = ChatPromptTemplate.from_messages(
-    [("system", xml_system), ("human", "{input}")]
-)
+xml_prompt = ChatPromptTemplate.from_messages([("system", xml_system), ("human", "{input}")])
+

 def format_docs_xml(docs: list[Document]) -> str:
     formatted = []
@@ -106,8 +105,7 @@ def get_doc_content(docs, id):


 def remove_citations(text):
-    text = re.sub(r
-    text = re.sub(r'[\d+]', '', text)
+    text = re.sub(r"<\d+>", "", text)
     return text


@@ -115,24 +113,24 @@ def display_cited_text(data):
     combined_text = ""
     citations = {}
     # Iterate through the cited_text list
-    if
-        for item in data[
-            if
-                chunk_text = item[
+    if "cited_text" in data:
+        for item in data["cited_text"]:
+            if "chunk" in item and len(item["chunk"]) > 0:
+                chunk_text = item["chunk"][0].get("text")
                 combined_text += chunk_text
                 citation_ids = []
                 # Process the citations for the chunk
-                if len(item[
-                    for c in item[
-                        if c and
-                            citation = c[
+                if len(item["chunk"]) > 1 and item["chunk"][1]["citations"]:
+                    for c in item["chunk"][1]["citations"]:
+                        if c and "citation" in c:
+                            citation = c["citation"]
                             if isinstance(citation, dict) and "source_id" in citation:
-                                citation = citation[
+                                citation = citation["source_id"]
                             if isinstance(citation, str):
                                 try:
                                     citation_ids.append(int(citation))
                                 except ValueError:
-                                    pass
+                                    pass  # Handle cases where the string is not a valid integer
                 if citation_ids:
                     citation_texts = [f"<{cid}>" for cid in citation_ids]
                     combined_text += " " + "".join(citation_texts)
@@ -144,24 +142,27 @@ def get_citations(data, docs):
     # Initialize variables for the combined text and a dictionary for citations
     citations = {}
     # Iterate through the cited_text list
-    if data.get(
-        for item in data[
+    if data.get("cited_text"):
+        for item in data["cited_text"]:
             citation_ids = []
-            if
-                for c in item[
-                    if c and
-                        citation = c[
+            if "chunk" in item and len(item["chunk"]) > 1 and item["chunk"][1].get("citations"):
+                for c in item["chunk"][1]["citations"]:
+                    if c and "citation" in c:
+                        citation = c["citation"]
                         if isinstance(citation, dict) and "source_id" in citation:
-                            citation = citation[
+                            citation = citation["source_id"]
                         if isinstance(citation, str):
                             try:
                                 citation_ids.append(int(citation))
                             except ValueError:
-                                pass
+                                pass  # Handle cases where the string is not a valid integer
             # Store unique citations in a dictionary
             for citation_id in citation_ids:
                 if citation_id not in citations:
-                    citations[citation_id] = {
+                    citations[citation_id] = {
+                        "source": docs[citation_id].metadata["source"],
+                        "content": docs[citation_id].page_content,
+                    }

     return citations

@@ -243,16 +244,12 @@ def generate_rag(
     docs = retriever.get_relevant_documents(topic)

     formatted_docs = format_docs_xml(docs)
-    rag_chain = (
-        RunnablePassthrough.assign(context=lambda _: formatted_docs)
-        | xml_prompt
-        | llm
-        | XMLOutputParser()
-    )
+    rag_chain = RunnablePassthrough.assign(context=lambda _: formatted_docs) | xml_prompt | llm | XMLOutputParser()
     result = rag_chain.invoke({"input": prompt})
     citations = get_citations(result, docs)
     return result, citations

+
 def generate_base(
     prompt: str, topic: str, model: str, temperature: float, max_length: int, api_key: str, sys_message=""
 ):
@@ -262,9 +259,7 @@ def generate_base(
         return None, None
     try:
         output = llm.invoke(prompt).content
-        output_dict = {'cited_text': [
-            {'chunk': [{'text': output}, {'citations': None}]}
-        ]}
+        output_dict = {"cited_text": [{"chunk": [{"text": output}, {"citations": None}]}]}
         return output_dict, None
     except Exception as e:
         print(f"An error occurred while running the model: {e}")
@@ -285,4 +280,4 @@ def generate(
     if path or url_content:
         return generate_rag(prompt, topic, model, url_content, path, temperature, max_length, api_key, sys_message)
     else:
-        return generate_base(prompt, topic, model, temperature, max_length, api_key, sys_message)
+        return generate_base(prompt, topic, model, temperature, max_length, api_key, sys_message)
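For reference, a minimal sketch of the parsed result shape these helpers traverse, inferred from the code above (generate_base now builds the same structure by hand); the sample text and source id are invented for illustration:

    # Hypothetical result in the shape display_cited_text / get_citations expect.
    sample_result = {
        "cited_text": [
            {"chunk": [{"text": "Solar capacity grew rapidly in 2023."},
                       {"citations": [{"citation": {"source_id": "0"}}]}]},
            {"chunk": [{"text": " Costs continued to fall."}, {"citations": None}]},
        ]
    }

    print(display_cited_text(sample_result))
    # -> "Solar capacity grew rapidly in 2023. <0> Costs continued to fall."
    # get_citations(sample_result, docs) would then map 0 to docs[0].metadata["source"]
    # and docs[0].page_content, which feed the popup HTML in app.py.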
app.py
CHANGED
@@ -22,6 +22,11 @@ from humanize import humanize_text, device
 from ai_generate import generate, citations_to_html, remove_citations, display_cited_text
 import nltk

+import uuid
+import json
+from datetime import datetime
+from google.cloud import storage
+
 nltk.download("punkt_tab")

 print(f"Using device: {device}")
@@ -43,6 +48,161 @@ tokenizers = {
 tool = language_tool_python.LanguageTool("en-US")


+def generate_cited_html(cited_text, citations: dict):
+    cited_text = cited_text.replace("\n", "<br>")
+    html_code = """
+    <style>
+        .reference-container {
+            position: relative;
+            display: inline-block;
+        }
+        .reference-btn {
+            display: inline-block;
+            width: 25px;
+            height: 25px;
+            border-radius: 50%;
+            background-color: #0000EE; /* Blue color for the button */
+            color: white;
+            text-align: center;
+            line-height: 25px;
+            cursor: pointer;
+            font-weight: bold;
+            margin-right: 5px;
+            transition: background-color 0.3s ease, transform 0.3s ease;
+        }
+        .reference-btn:hover {
+            background-color: #1e90ff; /* Lighter blue on hover */
+            transform: scale(1.1); /* Slightly enlarge on hover */
+        }
+        .reference-popup {
+            display: none;
+            position: absolute;
+            z-index: 1;
+            top: 100%;
+            left: 0;
+            background-color: #f9f9f9;
+            border: 1px solid #ddd;
+            padding: 10px;
+            border-radius: 4px;
+            box-shadow: 0 2px 5px rgba(0,0,0,0.2);
+            width: calc(min(90vw, 500px));
+            max-height: calc(min(80vh, 300px));
+            overflow-y: auto;
+            transform: translateX(0); /* Default position */
+        }
+        .reference-container .reference-popup {
+            left: 50%;
+            transform: translateX(-50%); /* Center align popup horizontally by default */
+        }
+        .reference-container[data-align="right"] .reference-popup {
+            left: auto;
+            right: 0;
+            transform: translateX(-10%); /* Pull the popup slightly left when near right edge */
+        }
+        .reference-popup .close-btn {
+            float: right;
+            cursor: pointer;
+            font-weight: bold;
+            color: white;
+            font-size: 16px;
+            padding: 0;
+            width: 20px;
+            height: 20px;
+            text-align: center;
+            line-height: 20px;
+            background-color: #ff4c4c;
+            border-radius: 2px;
+            transition: transform 0.3s ease, background-color 0.3s ease;
+        }
+        .reference-popup .close-btn:hover {
+            transform: scale(1.2);
+            background-color: #ff3333;
+        }
+        input[type="radio"] {
+            position: absolute;
+            opacity: 0;
+            pointer-events: none;
+        }
+        input[type="radio"]:checked + .reference-popup {
+            display: block;
+        }
+        @media (prefers-color-scheme: dark) {
+            .reference-btn {
+                background-color: #1e90ff;
+            }
+            .reference-popup {
+                background-color: #2c2c2c;
+                border-color: #444;
+                color: #f1f1f1;
+            }
+            .reference-popup .close-btn {
+                background-color: #ff4c4c;
+            }
+            .reference-popup .close-btn:hover {
+                background-color: #ff3333;
+            }
+        }
+    </style>
+    <script>
+        document.addEventListener('click', (event) => {
+            const containers = document.querySelectorAll('.reference-container');
+            containers.forEach(container => {
+                const rect = container.getBoundingClientRect();
+                if (rect.right > window.innerWidth - 50) {
+                    container.setAttribute('data-align', 'right');
+                } else if (rect.left < 50) {
+                    container.setAttribute('data-align', 'left');
+                } else {
+                    container.removeAttribute('data-align');
+                }
+            });
+        });
+
+        function closeReferencePanes(event) {
+            if (!event.target.closest('.reference-container')) {
+                const checkboxes = document.querySelectorAll('input[type="radio"]');
+                checkboxes.forEach(checkbox => checkbox.checked = false);
+            }
+        }
+
+        document.addEventListener('click', closeReferencePanes);
+    </script>
+    <div style="height: 600px; overflow-y: auto;">
+    """
+
+    # Function to replace each citation with a reference button
+    def replace_citations(match):
+        citation_id = match.group(1)  # Extract citation number from the match
+        ref_data = citations.get(int(citation_id))
+
+        # If reference data is not found, return the original text
+        if not ref_data:
+            return match.group(0)
+
+        # Split the content by newlines and wrap each in <p> tags to maintain paragraph structure
+        paragraphs = ref_data["content"].split("\n")
+        formatted_content = "".join(f"<p>{para.strip()}</p>" for para in paragraphs if para.strip())
+
+        # HTML code for the reference button and popup with paragraph formatting
+        button_html = f"""
+        <span class="reference-container">
+            <label for="ref-toggle-{citation_id}" class="reference-btn" onclick="closeReferencePanes(); document.getElementById('ref-toggle-{citation_id}').checked = true;">{citation_id}</label>
+            <input type="radio" id="ref-toggle-{citation_id}" name="reference" />
+            <span class="reference-popup">
+                <span class="close-btn" onclick="document.getElementById('ref-toggle-{citation_id}').checked = false;">×</span>
+                <strong>Source:</strong> {ref_data['source']}<br>
+                <strong>Content:</strong> {formatted_content}
+            </span>
+        </span>
+        """
+        return button_html
+
+    # Replace inline citations in the text with the generated HTML
+    html_code += re.sub(r"<(\d+)>", replace_citations, cited_text)
+    html_code += "</div>"
+    return html_code
+
+
 # Function to move model to the appropriate device
 def to_device(model):
     return model.to(device)
@@ -256,7 +416,8 @@ def highlighter_polygraf(text, model="Polygraf AI (Base Model)"):
     return score, text, mc_score


-def ai_check(
+def ai_check(history: list, option: str):
+    text = history[-1][1]
     if option.startswith("Polygraf AI"):
         return highlighter_polygraf(text, option)
     else:
@@ -382,15 +543,15 @@ def generate_article(
         api_key=api_key,
         sys_message="",
     )
-    return article,
+    return article, citations


 def get_history(history):
-    return history
-
-
-
-
+    # return history
+    history_formatted = []
+    for entry in history:
+        history_formatted.append((entry[0], entry[1]))
+    return history_formatted


 def clear_history():
@@ -409,8 +570,9 @@ def humanize(
 ) -> str:
     print("Humanizing text...")
     # body, references = split_text_from_refs(text)
-
-
+    cited_text = history[-1][1]
+    citations = history[-1][2]
+    article = humanize_text(
         text=cited_text,
         model_name=model,
         temperature=temperature,
@@ -420,9 +582,22 @@ def humanize(
     )
     # result = result + references
     # corrected_text = format_and_correct_language_check(result)
+    article = clean_text(article)
     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    history.append((f"Humanized Text | {timestamp}\nInput: {model}",
-
+    history.append((f"Humanized Text | {timestamp}\nInput: {model}", article, citations))
+    latest_humanizer_data = {
+        "original text": cited_text,
+        "humanized text": article,
+        "citations": citations,  # can remove saving citations
+        "metadata": {
+            "temperature": temperature,
+            "repetition_penalty": repetition_penalty,
+            "top_k": top_k,
+            "length_penalty": length_penalty,
+        },
+        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+    }
+    return generate_cited_html(article, citations), history, latest_humanizer_data


 def update_visibility_api(model: str):
@@ -458,11 +633,6 @@ def update_temperature(model_dropdown):
     return gr.update(value=1.0, interactive=True)


-import uuid
-import json
-from datetime import datetime
-from google.cloud import storage
-
 # Initialize Google Cloud Storage client
 client = storage.Client()
 bucket_name = "ai-source-detection"
@@ -537,6 +707,31 @@ def save_to_cloud_storage(
     return f"Data saved as {file_name} in GCS."


+def save_humanizer_feedback_to_cloud_storage(data, humanizer_feedback):
+    """Save generated article and metadata to Google Cloud Storage within a specific folder."""
+    if data:
+        try:
+            data["user_feedback"] = humanizer_feedback
+            # Create a unique filename
+            file_id = str(uuid.uuid4())
+
+            # Define the file path and name in the bucket
+            folder_path = "ai-writer/humanizer-feedback/"
+            file_name = f"{folder_path}{data['timestamp'].replace(' ', '_').replace(':', '-')}_{file_id}.json"
+
+            # Convert data to JSON string
+            json_data = json.dumps(data)
+
+            # Create a blob and upload to GCS
+            blob = bucket.blob(file_name)
+            blob.upload_from_string(json_data, content_type="application/json")
+            gr.Info("Successfully reported. Thank you for the feedback!")
+        except Exception:
+            gr.Warning("Report not saved.")
+    else:
+        gr.Warning("Nothing humanized to save yet!")
+
+
 def generate_and_format(
     input_role,
     topic,
@@ -610,9 +805,10 @@ def generate_and_format(
         # for url in url_content.keys():
         #     article += f"\n{url}"

+    article = clean_text(display_cited_text(article))
     # reference_formatted = format_references(article)
     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    history.append((f"Generated Text | {timestamp}\nInput: {topic}",
+    history.append((f"Generated Text | {timestamp}\nInput: {topic}", article, citations))

     # Save the article and metadata to Cloud Storage
     # We dont save if there is PDF input for privacy reasons
@@ -641,8 +837,7 @@ def generate_and_format(
         timestamp,
     )
     print(save_message)
-
-    return clean_text(display_cited_text(article)), citations, history
+    return generate_cited_html(article, citations), history


 def create_interface():
@@ -655,6 +850,7 @@ def create_interface():
         """,
     ) as demo:
         history = gr.State([])
+        latest_humanizer_data = gr.State()
         today = date.today()
         # dd/mm/YY
         d1 = today.strftime("%d/%B/%Y")
@@ -869,15 +1065,20 @@ def create_interface():

             with gr.Column(scale=3):
                 with gr.Tab("Text Generator"):
-                    output_article = gr.
-
-
-                    ai_comments = gr.Textbox(
-                        label="Add comments to help edit generated text", interactive=True, visible=False
+                    output_article = gr.HTML(
+                        value="""<div style="height: 600px;"></div>""",
+                        label="Generated Article",
                     )
-
-
-
+                    humanizer_feedback = gr.Textbox(label="Add optional feedback on humanizer")
+                    report_humanized_btn = gr.Button("Report Humanized Text", variant="primary", visible=True)
+                    with gr.Accordion("Regenerate Article", open=False):
+                        ai_comments = gr.Textbox(
+                            label="Add comments to help edit generated text", interactive=True, visible=True
+                        )
+                        regenerate_btn = gr.Button("Regenerate Article", variant="primary", visible=True)
+
+                    ai_detector_dropdown = gr.Dropdown(
+                        choices=ai_check_options, label="Select AI Detector", value="Polygraf AI (Base Model)"
                     )
                     ai_check_btn = gr.Button("AI Check")

@@ -939,15 +1140,18 @@ def create_interface():
                 return gr.update(visible=False)

         google_search_check.change(search_visible, inputs=google_search_check, outputs=search_options)
-        ai_detector_dropdown.change(highlight_visible, inputs=ai_detector_dropdown, outputs=highlighted_text)
-        output_article.change(regenerate_visible, inputs=output_article, outputs=ai_comments)
-        ai_comments.change(regenerate_visible, inputs=output_article, outputs=regenerate_btn)
+        # ai_detector_dropdown.change(highlight_visible, inputs=ai_detector_dropdown, outputs=highlighted_text)
+        # output_article.change(regenerate_visible, inputs=output_article, outputs=ai_comments)
+        # ai_comments.change(regenerate_visible, inputs=output_article, outputs=regenerate_btn)
         ai_check_btn.click(highlight_visible, inputs=ai_detector_dropdown, outputs=highlighted_text)

         # Update the default structure based on the selected format
         # e.g. "Plain Text" for certain formats
         input_format.change(fn=update_structure, inputs=input_format, outputs=input_structure)
         model_dropdown.change(fn=update_temperature, inputs=model_dropdown, outputs=temperature_slider)
+        report_humanized_btn.click(
+            save_humanizer_feedback_to_cloud_storage, inputs=[latest_humanizer_data, humanizer_feedback]
+        )

         generate_btn.click(
             fn=generate_and_format,
@@ -981,7 +1185,7 @@ def create_interface():
                 pdf_file_input,
                 history,
             ],
-            outputs=[output_article,
+            outputs=[output_article, history],
         )

         regenerate_btn.click(
@@ -1018,12 +1222,12 @@ def create_interface():
                 exclude_sites,
                 ai_comments,
             ],
-            outputs=[output_article,
+            outputs=[output_article, history],
         )

         ai_check_btn.click(
             fn=ai_check,
-            inputs=[
+            inputs=[history, ai_detector_dropdown],
             outputs=[ai_check_result, highlighted_text, mc_check_result],
         )
@@ -1038,7 +1242,7 @@ def create_interface():
                 length_penalty_slider,
                 history,
             ],
-            outputs=[output_article, history],
+            outputs=[output_article, history, latest_humanizer_data],
         )

         generate_btn.click(get_history, inputs=[history], outputs=[history_chat])
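A hedged usage sketch for the new popup flow, assuming the integer-keyed citations dict that get_citations returns; the article text and source name below are made up:

    # Hypothetical inputs: one cited sentence and one source entry.
    citations = {0: {"source": "solar_report.pdf", "content": "Global solar capacity grew by 32% in 2023."}}
    article = "Solar capacity grew rapidly in 2023. <0>"

    html = generate_cited_html(article, citations)
    # Each <0> marker is replaced by a numbered .reference-btn; its hidden radio input
    # toggles the .reference-popup showing the source name and content. The returned
    # HTML string is what the gr.HTML output_article component renders.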