eljanmahammadli committed
Commit 593bb22 · 1 Parent(s): ba91632

inline citations and more

Files changed (2)
  1. ai_generate.py +104 -10
  2. app.py +20 -14
ai_generate.py CHANGED
@@ -1,3 +1,4 @@
+import gc
 import os
 from langchain_community.document_loaders import PyMuPDFLoader
 from langchain_core.documents import Document
@@ -18,6 +19,15 @@ from dotenv import load_dotenv
 from langchain_core.output_parsers import XMLOutputParser
 from langchain.prompts import ChatPromptTemplate
 import re
+import numpy as np
+import torch
+
+# pip install bm25s
+import bm25s
+from langchain_community.cross_encoders import HuggingFaceCrossEncoder
+from langchain.retrievers import ContextualCompressionRetriever
+from langchain.retrievers.document_compressors import CrossEncoderReranker
+from langchain_core.messages import HumanMessage
 
 load_dotenv()
 
@@ -29,7 +39,17 @@ os.environ["GLOG_minloglevel"] = "2"
 CHUNK_SIZE = 1024
 CHUNK_OVERLAP = CHUNK_SIZE // 8
 K = 10
-FETCH_K = 20
+FETCH_K = 50
+
+model_kwargs = {"device": "cuda:1"}
+print("Loading embedding and reranker models...")
+embedding_function = SentenceTransformerEmbeddings(
+    model_name="mixedbread-ai/mxbai-embed-large-v1", model_kwargs=model_kwargs
+)
+# "sentence-transformers/all-MiniLM-L6-v2"
+# "mixedbread-ai/mxbai-embed-large-v1"
+reranker = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-base", model_kwargs=model_kwargs)
+compressor = CrossEncoderReranker(model=reranker, top_n=K)
 
 llm_model_translation = {
     "LLaMA 3": "llama3-70b-8192",
@@ -195,10 +215,9 @@ def load_llm(model: str, api_key: str, temperature: float = 1.0, max_length: int
     return llm
 
 
-def create_db_with_langchain(path: list[str], url_content: dict):
+def create_db_with_langchain(path: list[str], url_content: dict, query: str):
     all_docs = []
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
-    embedding_function = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")
     if path:
         for file in path:
             loader = PyMuPDFLoader(file)
@@ -214,18 +233,38 @@ def create_db_with_langchain(path: list[str], url_content: dict):
             docs = text_splitter.split_documents([doc])
             all_docs.extend(docs)
 
+    print(f"### Total number of documents before bm25s: {len(all_docs)}")
+
+    # if the number of docs is too high, we need to reduce it
+    num_max_docs = 250
+    if len(all_docs) > num_max_docs:
+        docs_raw = [doc.page_content for doc in all_docs]
+        retriever = bm25s.BM25(corpus=docs_raw)
+        retriever.index(bm25s.tokenize(docs_raw))
+        results, scores = retriever.retrieve(bm25s.tokenize(query), k=len(docs_raw), sorted=False)
+        top_indices = np.argpartition(scores[0], -num_max_docs)[-num_max_docs:]
+        all_docs = [all_docs[i] for i in top_indices]
+
     # print docs
     for idx, doc in enumerate(all_docs):
         print(f"Doc: {idx} | Length = {len(doc.page_content)}")
 
     assert len(all_docs) > 0, "No PDFs or scraped data provided"
     db = Chroma.from_documents(all_docs, embedding_function)
+    torch.cuda.empty_cache()
+    gc.collect()
     return db
 
 
+def pretty_print_docs(docs):
+    print(f"\n{'-' * 100}\n".join([f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]))
+
+
 def generate_rag(
     prompt: str,
+    input_role: str,
     topic: str,
+    context: str,
     model: str,
     url_content: dict,
     path: list[str],
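
Note on the BM25 pre-filter introduced in this hunk: it is standard bm25s usage — index all chunks, score them against the query, and keep the top num_max_docs indices with argpartition (O(n), no full sort needed). A minimal self-contained sketch with an illustrative corpus and query (assumes pip install bm25s):

import bm25s
import numpy as np

corpus = [
    "Chroma stores dense embeddings for semantic search.",
    "BM25 is a lexical ranking function based on term statistics.",
    "Cross-encoders rerank query-document pairs jointly.",
]
query = "lexical ranking for search"

retriever = bm25s.BM25(corpus=corpus)
retriever.index(bm25s.tokenize(corpus))

# Score every document (k=len(corpus), unsorted), then keep the top-k indices.
results, scores = retriever.retrieve(bm25s.tokenize(query), k=len(corpus), sorted=False)
keep = 2  # stands in for num_max_docs
top_indices = np.argpartition(scores[0], -keep)[-keep:]
print([corpus[i] for i in top_indices])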
@@ -238,18 +277,24 @@ def generate_rag(
     if llm is None:
         print("Failed to load LLM. Aborting operation.")
         return None
-    db = create_db_with_langchain(path, url_content)
-    retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": K, "fetch_k": FETCH_K})
 
-    docs = retriever.get_relevant_documents(topic)
+    query = llm_wrapper(input_role, topic, context, model="OpenAI GPT 4o", task_type="rag", temperature=0.7)
+    print("### Query: ", query)
+    db = create_db_with_langchain(path, url_content, query)
+    retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": K, "fetch_k": FETCH_K, "lambda_mult": 0.75})
+
+    # docs = retriever.get_relevant_documents(query)
+    compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever)
+    docs = compression_retriever.invoke(query)
+    pretty_print_docs(docs)
 
     formatted_docs = format_docs_xml(docs)
     rag_chain = RunnablePassthrough.assign(context=lambda _: formatted_docs) | xml_prompt | llm | XMLOutputParser()
     result = rag_chain.invoke({"input": prompt})
     citations = get_citations(result, docs)
-
-    db.delete_collection()  # delete otherwise there could be duplicates because of the cache
-
+    db.delete_collection()  # important, otherwise it will keep the documents in memory
+    torch.cuda.empty_cache()
+    gc.collect()
     return result, citations
 
 
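Retrieval in generate_rag is now two-stage: MMR over Chroma casts a wide net (FETCH_K candidates, with lambda_mult trading relevance against diversity), then the module-level cross-encoder rescores the survivors and keeps the top K. A condensed sketch of the same composition (illustrative documents; the lightweight all-MiniLM-L6-v2 embedder stands in for mxbai-embed-large-v1, and import paths for Chroma/SentenceTransformerEmbeddings are assumed to match this file's):

from langchain_core.documents import Document
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker

docs = [Document(page_content=t) for t in ("first chunk ...", "second chunk ...", "third chunk ...")]
db = Chroma.from_documents(docs, SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2"))

# Stage 1: MMR over the vector store; fetch_k > k widens the candidate pool.
retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": 2, "fetch_k": 3, "lambda_mult": 0.75})

# Stage 2: the cross-encoder rescores each (query, document) pair and keeps top_n.
compressor = CrossEncoderReranker(model=HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-base"), top_n=2)
compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever)
print(compression_retriever.invoke("an illustrative query"))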
@@ -271,7 +316,9 @@ def generate_base(
 
 def generate(
     prompt: str,
+    input_role: str,
     topic: str,
+    context: str,
     model: str,
     url_content: dict,
     path: list[str],
@@ -281,6 +328,53 @@ def generate(
     sys_message="",
 ):
     if path or url_content:
-        return generate_rag(prompt, topic, model, url_content, path, temperature, max_length, api_key, sys_message)
+        return generate_rag(
+            prompt, input_role, topic, context, model, url_content, path, temperature, max_length, api_key, sys_message
+        )
     else:
         return generate_base(prompt, topic, model, temperature, max_length, api_key, sys_message)
+
+
+def llm_wrapper(
+    iam=None,
+    topic=None,
+    context=None,
+    temperature=1.0,
+    max_length=512,
+    api_key="",
+    model="OpenAI GPT 4o Mini",
+    task_type="internet",
+):
+    llm = load_llm(model, api_key, temperature, max_length)
+
+    if task_type == "rag":
+        system_message_content = """You are an AI assistant tasked with reformulating user inputs to improve the retrieval query in a RAG system.
+- Given the original user inputs, construct a query that is more specific, detailed, and likely to retrieve relevant information.
+- Generate the query as a complete sentence or question, not just as keywords, to ensure the retrieval process can find detailed and contextually relevant information.
+- You may enhance the query by adding related and relevant terms, but do not introduce new facts, such as dates, numbers, or assumed information, that were not provided in the input.
+
+**Inputs:**
+- **User Role**: {iam}
+- **Topic**: {topic}
+- **Context**: {context}
+
+**Only return the search query**."""
+    elif task_type == "internet":
+        system_message_content = """You are an AI assistant tasked with generating an optimized Google search query to help retrieve relevant websites, news, articles, and other sources of information.
+- You may enhance the query by adding related and relevant terms, but do not introduce new facts, such as dates, numbers, or assumed information, that were not provided in the input.
+- The query should be **concise** and include important **keywords** while incorporating **short phrases** or context where it improves the search.
+- Avoid the use of "site:" operators or narrowing the search to specific websites.
+
+**Inputs:**
+- **User Role**: {iam}
+- **Topic**: {topic}
+- **Context**: {context}
+
+**Only return the search query**.
+"""
+    else:
+        raise ValueError("Task type not recognized. Please specify 'rag' or 'internet'.")
+
+    human_message = HumanMessage(content=system_message_content.format(iam=iam, topic=topic, context=context))
+    response = llm.invoke([human_message])
+    return response.content.strip('"').strip("'")
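
Usage-wise, llm_wrapper turns the raw role/topic/context into a single retrieval or search query. An illustrative call with the signature above (argument values are made up):

query = llm_wrapper(
    iam="marketing analyst",  # hypothetical inputs
    topic="electric vehicle adoption",
    context="focus on charging infrastructure in Europe",
    model="OpenAI GPT 4o",
    task_type="rag",
    temperature=0.7,
)
# Returns one reformulated sentence/question with surrounding quotes stripped,
# which generate_rag then feeds to create_db_with_langchain and the reranking retriever.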
app.py CHANGED
@@ -3,6 +3,7 @@ nohup python3 app.py &
 export GOOGLE_APPLICATION_CREDENTIALS="gcp_creds.json"
 """
 
+import gc
 import re
 import uuid
 import json
@@ -23,13 +24,12 @@ if gr.NO_RELOAD:
     from humanize import humanize_text, device
     from utils import remove_special_characters, split_text_allow_complete_sentences_nltk
     from google_search import google_search, months, domain_list, build_date
-    from ai_generate import generate, citations_to_html, remove_citations, display_cited_text
+    from ai_generate import generate, citations_to_html, remove_citations, display_cited_text, llm_wrapper
 
-if gr.NO_RELOAD:
     nltk.download("punkt_tab")
 
     print(f"Using device: {device}")
-
+    print("Loading AI detection models...")
     models = {
         "Polygraf AI (Base Model)": AutoModelForSequenceClassification.from_pretrained(
             "polygraf-ai/bc-roberta-openai-2sent"
@@ -51,6 +51,7 @@ if gr.NO_RELOAD:
     TEXT_MC_MODEL_PATH = "polygraf-ai/mc-model"
     MC_LABEL_MAP = ["OpenAI GPT", "Mistral", "CLAUDE", "Gemini", "Grammar Enhancer"]
     text_mc_tokenizer = AutoTokenizer.from_pretrained(TEXT_MC_MODEL_PATH)
+    print("Loading Source detection model...")
    text_mc_model = AutoModelForSequenceClassification.from_pretrained(TEXT_MC_MODEL_PATH).to(device)
 
 
@@ -64,20 +65,20 @@ def generate_cited_html(cited_text, citations: dict):
     }
     .reference-btn {
         display: inline-block;
-        width: 25px;
-        height: 25px;
+        width: 20px; /* Reduced width */
+        height: 20px; /* Reduced height */
         border-radius: 50%;
-        background-color: #0000EE; /* Blue color for the button */
+        background-color: #e33a89; /* Pink color for the button */
         color: white;
         text-align: center;
-        line-height: 25px;
+        line-height: 20px; /* Adjusted line-height */
         cursor: pointer;
         font-weight: bold;
         margin-right: 5px;
         transition: background-color 0.3s ease, transform 0.3s ease;
     }
     .reference-btn:hover {
-        background-color: #1e90ff; /* Lighter blue on hover */
+        background-color: #ff69b4; /* Lighter pink on hover */
         transform: scale(1.1); /* Slightly enlarge on hover */
     }
     .reference-popup {
@@ -357,6 +358,8 @@ def predict(model, tokenizer, text):
     output = model(**tokens)
     output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
     output_norm = {"HUMAN": output_norm[0], "AI": output_norm[1]}
+    torch.cuda.empty_cache()
+    gc.collect()
     return output_norm
 
 
@@ -428,6 +431,8 @@ def predict_mc(text):
     ).to(device)
     output = text_mc_model(**tokens)
     output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
+    torch.cuda.empty_cache()
+    gc.collect()
     return output_norm
 
 
@@ -582,7 +587,9 @@ def generate_article(
     print("Generated Prompt...\n", prompt)
     article, citations = generate(
         prompt=prompt,
+        input_role=input_role,
         topic=topic,
+        context=context,
         model=ai_model,
         url_content=url_content,
         path=pdf_file_input,
@@ -692,7 +699,6 @@ def save_to_cloud_storage(
     article,
     topic,
     input_role,
-    topic_context,
     context,
     keywords,
     article_length,
@@ -725,7 +731,6 @@
         "metadata": {
             "topic": topic,
             "input_role": input_role,
-            "topic_context": topic_context,
             "context": context,
             "keywords": keywords,
             "article_length": article_length,
@@ -818,7 +823,9 @@ def generate_and_format(
     date_from = build_date(year_from, month_from, day_from)
     date_to = build_date(year_to, month_to, day_to)
     sorted_date = f"date:r:{date_from}:{date_to}"
-    final_query = topic
+    final_query = llm_wrapper(
+        input_role, topic, context, model="OpenAI GPT 4o", task_type="internet", temperature=0.7
+    )
     if include_sites:
         site_queries = [f"site:{site.strip()}" for site in include_sites.split(",")]
         final_query += " " + " OR ".join(site_queries)
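
The Google query is now LLM-built from role/topic/context instead of the bare topic, and only then extended with the user's site filters. An illustrative trace of that flow (all values made up):

final_query = llm_wrapper(input_role, topic, context, model="OpenAI GPT 4o", task_type="internet", temperature=0.7)
# e.g. final_query -> "electric vehicle charging infrastructure Europe adoption"
include_sites = "example.com, example.org"  # hypothetical user input
final_query += " " + " OR ".join(f"site:{s.strip()}" for s in include_sites.split(","))
# -> '... site:example.com OR site:example.org'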
@@ -827,10 +834,10 @@
         final_query += " " + " ".join(exclude_queries)
     print(f"Google Search Query: {final_query}")
     url_content = google_search(final_query, sorted_date, domains_to_include)
-    topic_context = topic + ", " + context
+    # topic_context = topic + ", " + context
     article, citations = generate_article(
         input_role,
-        topic_context,
+        topic,
         context,
         keywords,
         article_length,
@@ -866,7 +873,6 @@
         article,
         topic,
         input_role,
-        topic_context,
         context,
         keywords,
         article_length,