Spaces:
Running
Running
raannakasturi
committed on
Commit
·
3ec5aa6
1
Parent(s):
41d4301
Add initial implementation for paper summarization and data fetching
Browse files- .gitignore +4 -0
- fetch_data.py +69 -0
- main.py +51 -0
- post_blog.py +156 -0
- summarize_paper.py +17 -0
.gitignore
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
.env
|
2 |
+
/__*
|
3 |
+
*.json
|
4 |
+
/BLOGGER
|
fetch_data.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from gradio_client import Client
|
2 |
+
import json
|
3 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
4 |
+
|
5 |
+
def fetch_category_ids(cat_ids_api_key):
    """Fetch the category -> paper-IDs mapping from the remote ID-fetching API.

    Raises ValueError when no access key is supplied. Returns the payload's
    'data' mapping on success, or None on API failure or any exception.
    """
    if not cat_ids_api_key:
        raise ValueError("API access key not found. Please check your environment variables.")
    client = Client("raannakasturi/ReXploreIDFetchingAPI")
    try:
        raw = client.predict(
            user_access_key=cat_ids_api_key,
            api_name="/fetch_paper_ids"
        )
        payload = json.loads(raw)
        # Only a 'success' status carries usable data; anything else is a miss.
        return payload['data'] if payload['status'] == 'success' else None
    except Exception as e:
        print(f"Exception while fetching category IDs: {str(e)}")
        return None
|
23 |
+
|
24 |
+
def fetch_single_paper_data(paper_id):
    """Fetch metadata for one paper ID from the remote data-fetching API.

    Always returns a (paper_id, result) pair so concurrent callers can
    correlate results; result is the data dict on success, else None.
    """
    client = Client("raannakasturi/ReXplorePaperDataFetcher")
    try:
        raw = client.predict(
            id=paper_id,
            api_name="/fetch_paper_data"
        )
        payload = json.loads(raw)
        if payload['status'] != 'success':
            print(f"Failed to fetch data for paper ID {paper_id}: {payload.get('message', 'Unknown error')}")
            return paper_id, None
        return paper_id, payload['data']
    except Exception as e:
        print(f"Exception while fetching data for paper ID {paper_id}: {str(e)}")
        return paper_id, None
|
40 |
+
|
41 |
+
def fetch_paper_data_concurrently(paper_ids, max_threads=12):
    """Fetch data for every ID in *paper_ids* using a thread pool.

    Returns {paper_id: data} containing only the papers that resolved
    successfully; failures are logged and omitted.
    """
    results = {}
    with ThreadPoolExecutor(max_workers=max_threads) as pool:
        pending = {pool.submit(fetch_single_paper_data, pid): pid for pid in paper_ids}
        for done in as_completed(pending):
            pid = pending[done]
            try:
                # Workers return (id, data); data is None on a per-paper failure.
                pid, payload = done.result()
                if payload:
                    results[pid] = payload
            except Exception as e:
                print(f"Error fetching data for paper ID {pid}: {str(e)}")
    return results
|
54 |
+
|
55 |
+
def fetch_paper_data_with_category(cat_ids_api_key):
    """Fetch paper data for every category and serialize it as JSON.

    Returns a JSON object string of category -> {paper_id: data}, or None
    when an exception occurs anywhere in the pipeline.
    """
    collected = {}
    try:
        categories = fetch_category_ids(cat_ids_api_key)
        if categories:
            for name, id_block in categories.items():
                print(f"Fetching data for category: {name}")
                papers = fetch_paper_data_concurrently(id_block['ids'])
                print(papers)
                # Drop categories for which nothing could be fetched.
                if papers:
                    collected[name] = papers
        return json.dumps(collected, indent=4, ensure_ascii=False)
    except Exception as e:
        print(f"Exception while fetching paper data by category: {str(e)}")
        return None
|
main.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
import os
import dotenv
from summarize_paper import summarize_paper
from fetch_data import fetch_paper_data_with_category
from post_blog import post_blog

# Load environment variables (.env supported via python-dotenv).
dotenv.load_dotenv()
# Access key for the category/paper-ID fetching API (see fetch_data).
cat_ids_api_key = os.getenv("DATA_API_ACCESS_KEY")
# Access key forwarded to the remote summarizer (see summarize_paper).
summarizer_api_key = os.getenv("SUMMARIZER_API_KEY")
12 |
+
|
13 |
+
def paper_data(paper_data):
|
14 |
+
data = {"status": "success"}
|
15 |
+
data['data'] = {}
|
16 |
+
paper_data = json.loads(paper_data)
|
17 |
+
for category, papers in paper_data.items():
|
18 |
+
print(f"Processing category: {category}")
|
19 |
+
data['data'][category] = {}
|
20 |
+
for paper_id, details in papers.items():
|
21 |
+
doi = details.get("doi")
|
22 |
+
pdf_url = details.get("pdf_url")
|
23 |
+
title = details.get("title")
|
24 |
+
citation = details.get("citation")
|
25 |
+
if not all([paper_id, doi, pdf_url, title, citation]):
|
26 |
+
print(f"Skipping paper with ID: {paper_id} (missing details)")
|
27 |
+
continue
|
28 |
+
summary, mindmap = summarize_paper(pdf_url, paper_id, summarizer_api_key)
|
29 |
+
post_blog(title, category, summary, mindmap, citation, os.getenv('ACCESS_KEY'))
|
30 |
+
data['data'][category][paper_id] = {
|
31 |
+
"id": paper_id,
|
32 |
+
"doi": doi,
|
33 |
+
"title": title,
|
34 |
+
"category": category,
|
35 |
+
"citation": citation,
|
36 |
+
"summary": summary,
|
37 |
+
"mindmap": mindmap,
|
38 |
+
}
|
39 |
+
output_file = "paper_data_with_summary.json"
|
40 |
+
data = json.dumps(data, indent=4, ensure_ascii=False)
|
41 |
+
with open(output_file, "w", encoding="utf-8") as file:
|
42 |
+
json.dump(data, file, indent=4)
|
43 |
+
print(f"Processed data saved to {output_file}")
|
44 |
+
return data
|
45 |
+
|
46 |
+
if __name__ == "__main__":
|
47 |
+
data = fetch_paper_data_with_category(cat_ids_api_key)
|
48 |
+
with open("paper_data.json", "w", encoding="utf-8") as file:
|
49 |
+
json.dump(data, file, indent=4, ensure_ascii=False)
|
50 |
+
pdata = paper_data(data)
|
51 |
+
print(pdata)
|
post_blog.py
ADDED
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
import os
import requests
import base64
import dotenv
import mistune

# Blog credentials and endpoint come from the environment (.env supported).
# api_endpoint is suffixed with /categories and /posts below, so it is
# presumably a WordPress REST API base URL — TODO confirm.
dotenv.load_dotenv()
api_endpoint = os.getenv('API_ENDPOINT')
password = os.getenv('BLOG_PASSWORD')
username = os.getenv('BLOG_USERNAME')
|
12 |
+
|
13 |
+
def generate_post_html(title, summary, mindmap, citation):
    """Render the HTML body of a blog post: summary, markmap mindmap, citation.

    *summary* and *citation* are markdown (rendered with mistune); *mindmap*
    is markmap-flavoured markdown embedded in a script template that the
    markmap-autoloader script turns into an interactive SVG. The raw markdown
    is also stashed in `data` attributes for client-side reuse.
    """
    # NOTE(review): the .replace("&", "&") calls below are no-ops — they look
    # like HTML-entity escaping (.replace("&", "&amp;")) that was mangled at
    # some point; confirm the intended escaping before relying on this.
    html_summary = mistune.html(summary)
    post = f"""
<!-- wp:html -->
<div>
    <script src="https://cdn.jsdelivr.net/npm/markmap-autoloader@latest"></script>
    <style>
        .markmap {{
            position: relative;
        }}
        .markmap > svg {{
            width: 100%;
            border: 2px solid #000;
            height: 80dvh;
        }}
    </style>
    <p id="paper_summary" data="{summary.replace("&", "&")}">
        {html_summary.replace("&", "&")}
    </p>
    <br>
    <br>
    <h2>Mindmap</h2>
    <div class="markmap" data="{mindmap.replace("&", "&")}">
        <script type="text/template">
            # {title.replace("&", "&")}
            {mindmap.replace("&", "&")}
        </script>
    </div>
    <br>
    <br>
    <h2>Citation</h2>
    <p id="paper_citation" data="{citation.replace("&", "&")}">
        {mistune.html(citation.replace("&", "&"))}
    </p>
</div>
<!-- /wp:html -->
"""
    return post
|
51 |
+
|
52 |
+
def sanitize_citation(citation):
    """Convert bare https://doi.org/... URLs in *citation* into markdown links.

    Each DOI URL becomes [url](url), so both the link text and the target are
    the full DOI URL. The previous implementation rebuilt the target from only
    the last path segment (`split('/')[-1]`), which dropped the DOI registrant
    prefix — DOI names are "10.xxxx/suffix" — and produced dead links such as
    https://doi.org/arXiv.2412.16344.
    """
    pattern = r"(https://doi\.org/\S+)"
    return re.sub(pattern, lambda m: f"[{m.group(1)}]({m.group(1)})", citation)
|
60 |
+
|
61 |
+
|
62 |
+
def create_post(title, category, summary, mindmap, citation):
    """Assemble (title, category, html_body) for one post.

    The citation is sanitized into markdown links before rendering.
    """
    body = generate_post_html(title, summary, mindmap, sanitize_citation(citation))
    # f"{x}" coerces non-string inputs exactly like the rendering below expects.
    return f"{title}", f"{category}", body
|
67 |
+
|
68 |
+
def create_category_if_not_exists(category_name, headers):
    """Return the blog category ID for *category_name*, creating it if needed.

    Looks the name up case-insensitively in GET /categories; on a miss — or
    if the listing request itself fails — POSTs a new category. Returns the
    category ID, or None when creation fails.
    """
    categories_url = f"{api_endpoint}/categories"
    response = requests.get(categories_url, headers=headers)
    if response.status_code == 200:
        categories = response.json()
        for category in categories:
            # Case-insensitive match so "astrophysics" reuses "Astrophysics".
            if category['name'].lower() == category_name.lower():
                return category['id']
    # NOTE(review): the WordPress REST API paginates /categories (10 per page
    # by default), so an existing category beyond page 1 would be re-created
    # here — confirm whether per_page should be raised.
    create_response = requests.post(
        categories_url,
        headers=headers,
        json={"name": category_name}
    )
    if create_response.status_code == 201:
        return create_response.json()['id']
    else:
        print(f"Error creating category: {create_response.text}")
        return None
|
86 |
+
|
87 |
+
def post_post(title, category, body):
    """Publish one post through the blog's REST API.

    Authenticates with HTTP Basic auth built from the module-level
    username/password, resolves (or creates) the category, and publishes
    immediately. Returns True on HTTP 201, False otherwise.
    """
    token = base64.b64encode(f"{username}:{password}".encode('utf-8')).decode('utf-8')
    headers = {
        "Authorization": f"Basic {token}"
    }
    cat_id = create_category_if_not_exists(category, headers)
    # The API accepts an empty category list when resolution failed.
    category_ids = [cat_id] if cat_id else []
    payload = {
        "title": title,
        "status": "publish",
        "categories": category_ids,
        "content": body
    }
    response = requests.post(f"{api_endpoint}/posts", headers=headers, json=payload)
    print(response.status_code)
    if response.status_code != 201:
        print(f"Failed to post to blog: {response.text}")
        return False
    print(f"Posted to blog... [{category}] {title}")
    return True
|
112 |
+
|
113 |
+
def post_blog(title, category, summary, mindmap, citation, access_key):
    """Publish one paper post after validating the caller's access key.

    Returns False immediately when *access_key* does not match the ACCESS_KEY
    environment variable; otherwise builds and posts the article, returning
    True on success and False on any failure.
    """
    if access_key != os.getenv('ACCESS_KEY'):
        return False
    try:
        new_title, new_category, new_body = create_post(title, category, summary, mindmap, citation)
        if post_post(new_title, new_category, new_body):
            print('Post created successfully')
            return True
        print('Failed to create post')
        return False
    except Exception as e:
        print('An error occurred:', str(e))
        return False
|
128 |
+
|
129 |
+
if __name__ == '__main__':
    # Manual smoke test: publish one hard-coded Astrophysics paper using a
    # canned fetch/summarize result.
    # NOTE(review): this sample payload puts category buckets at the top level
    # next to "status", whereas main.py nests them under a "data" key —
    # confirm which shape downstream consumers expect.
    data = {
        "status": "success",
        "Astrophysics": {
            "2412.16344": {
                "id": "2412.16344",
                "doi": "https://doi.org/10.48550/arXiv.2412.16344",
                "title": "Focal Plane of the Arcus Probe X-Ray Spectrograph",
                "category": "Astrophysics",
                "citation": "Grant, C. E., Bautz, M. W., Miller, E. D., Foster, R. F., LaMarr, B., Malonis, A., Prigozhin, G., Schneider, B., Leitz, C., & Falcone, A. D. (2024). Focal Plane of the Arcus Probe X-Ray Spectrograph. ArXiv. https://doi.org/10.48550/ARXIV.2412.16344",
                "summary": "## Summary\nThe Arcus Probe mission concept provides high-resolution soft X-ray and UV spectroscopy to study the universe. The X-ray Spectrograph (XRS) uses two CCD focal planes to detect and record X-ray photons. Laboratory performance results meet observatory requirements.\n\n## Highlights\n- The Arcus Probe mission concept explores the formation and evolution of clusters, galaxies, and stars.\n- The XRS instrument includes four parallel optical channels and two detector focal plane arrays.\n- The CCDs are designed and manufactured by MIT Lincoln Laboratory (MIT/LL).\n- The XRS focal plane utilizes high heritage MIT/LL CCDs with proven technologies.\n- Laboratory testing confirms CCID-94 performance meets required spectral resolution and readout noise.\n- The Arcus mission includes two co-aligned instruments working simultaneously.\n- The XRS Instrument Control Unit (XICU) controls the activities of the XRS.\n\n## Key Insights\n- The Arcus Probe mission concept provides a significant improvement in sensitivity and resolution over previous missions, enabling breakthrough science in understanding the universe.\n- The XRS instrument's design, including the use of two CCD focal planes and four parallel optical channels, allows for high-resolution spectroscopy and efficient detection of X-ray photons.\n- The CCDs used in the XRS instrument are designed and manufactured by MIT Lincoln Laboratory (MIT/LL), which has a proven track record of producing high-quality CCDs for space missions.\n- The laboratory performance results of the CCID-94 device demonstrate that it meets the required spectral resolution and readout noise for the Arcus mission, indicating that the instrument is capable of achieving its scientific goals.\n- The XRS Instrument Control Unit (XICU) plays a crucial role in controlling the activities of the XRS, including gathering and storing data, and processing event recognition.\n- The Arcus mission's use of two co-aligned instruments working simultaneously allows for a wide range of scientific investigations, including the study of time-domain science and the physics of time-dependent phenomena.\n- The high heritage MIT/LL CCDs used in the XRS focal plane provide a reliable and efficient means of detecting X-ray photons, enabling the instrument to achieve its scientific goals.",
                "mindmap": "## Arcus Probe Mission Concept\n- Explores formation and evolution of clusters, galaxies, stars\n- High-resolution soft X-ray and UV spectroscopy\n- Agile response capability for time-domain science\n\n## X-Ray Spectrograph (XRS) Instrument\n- Two nearly identical CCD focal planes\n- Detects and records X-ray photons from dispersed spectra\n- Zero-order of critical angle transmission gratings\n\n## XRS Focal Plane Characteristics\n- Frametransfer X-ray CCDs\n- 8-CCD array per Detector Assembly\n- FWHM < 70 eV @ 0.5 keV\n- System read noise ≤ 4 e- RMS @ 625 kpixels/sec\n\n## Detector Assembly\n- Eight CCDs in a linear array\n- Tilted to match curved focal surface\n- Gaps minimized between CCDs\n- Alignment optimized with XRS optics\n\n## Detector Electronics\n- Programmable analog clock waveforms and biases\n- Low-noise analog signal processing and digitization\n- 1 second frame time for negligible pileup\n\n## XRS Instrument Control Unit (XICU)\n- Controls XRS activities and data transfer\n- Event Recognition Processor (ERP) extracts X-ray events\n- Reduces data rate by many orders of magnitude\n\n## CCD X-Ray Performance\n- Measured readout noise 2-3 e- RMS\n- Spectral resolution meets Arcus requirements\n- FWHM < 70 eV at 0.5 keV\n\n## CCID-94 Characteristics\n- Back-illuminated frame-transfer CCDs\n- 2048 × 1024 pixel imaging array\n- 24 × 24 µm image area pixel size\n- 50 µm detector thickness\n\n## Contamination Blocking Filter (CBF)\n- Protects detectors from molecular contamination\n- 45 nm polyimide + 30 nm Al\n- Maintained above +20°C by heater control\n\n## Optical Blocking Filter (OBF)\n- Attenuates visible/IR stray light\n- 40 nm Al on-chip filter\n- Works in conjunction with CBF"
            }
        }
    }
    if data['status'] != 'success':
        print('Failed to fetch data')
    else:
        # Every top-level key except the "status" flag is a category bucket.
        for category, catdata in data.items():
            if category != 'status':
                for paper_id, paperdata in catdata.items():
                    title = paperdata['title']
                    category = paperdata['category']
                    summary = paperdata['summary']
                    mindmap = paperdata['mindmap']
                    citation = paperdata['citation']
                    access_key = os.getenv('ACCESS_KEY')
                    post_blog(title, category, summary, mindmap, citation, access_key)
|
summarize_paper.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
from gradio_client import Client
|
3 |
+
|
4 |
+
def summarize_paper(pdf_url, paper_id, access_key):
    """Request a summary and mindmap for one paper from the ReXplore API.

    Returns a (summary, mindmap) pair; either element may be None when the
    API reply omits it. Exceptions from the remote call propagate.
    """
    client = Client("raannakasturi/ReXploreAPI")
    response = client.predict(
        url=pdf_url,
        id=paper_id,
        access_key=access_key,
        api_name="/rexplore_summarizer"
    )
    # The API returns a sequence whose first element is a JSON document.
    parsed = json.loads(response[0])
    return parsed.get('summary'), parsed.get('mindmap')
|