Spaces:

jpwahle
/

field-diversity

Sleeping

File size: 11,290 Bytes

# Copyright 2023 by Jan Philip Wahle, https://jpwahle.com/
# All rights reserved.
# Thanks to Mukund Rungta for inspiration on early versions of this demo https://huggingface.co/spaces/mrungta8/CitationalAmnesia


import asyncio

import gradio as gr

from aclanthology import determine_page_type
from plots import generate_cfdi_plot, generate_maoc_plot
from s2 import (check_s2_id_type, compute_stats_for_acl_author,
                compute_stats_for_acl_paper, compute_stats_for_acl_venue,
                compute_stats_for_pdf, compute_stats_for_s2_author,
                compute_stats_for_s2_paper)


def return_clear():
    """Produce the blank values used to reset the demo.

    Returns:
        tuple: Eight ``None`` values, one for each component listed in the
        outputs of the "Clear" button (five result widgets and three inputs).
    """
    blank_slots = 8
    return (None,) * blank_slots


def create_compute_stats(submit_type=None):
    """Create the event handler for one of the demo's input tabs.

    Args:
        submit_type (str): The input the handler reacts to: "s2_id",
            "acl_link" or "pdf_file". With any other value (including the
            default None) the handler always returns the empty result.

    Returns:
        function: ``compute_stats(s2_id=None, pdf_file=None, acl_link=None)``.
    """
    # One placeholder per value returned by plot_and_return_stats, i.e. one
    # per output component the compute handlers are wired to. Returning a
    # different number of values would not line up with those outputs.
    empty_result = (None,) * 5

    def finish(results, compute_type):
        # Append the kind of entity that was analysed and build the outputs.
        return plot_and_return_stats(*results, compute_type)

    def compute_stats(s2_id=None, pdf_file=None, acl_link=None):
        """Compute and plot the statistics for the configured input type.

        All three inputs are always passed in; only the one matching
        ``submit_type`` is looked at.

        Returns:
            tuple: The result of plot_and_return_stats, or five ``None``
            values when the relevant input is empty or its type is not
            recognised.
        """
        if submit_type == "s2_id" and s2_id:
            # Check if s2_id is a paper id or an author id
            id_type, author_name = check_s2_id_type(s2_id)
            if id_type == "paper":
                return finish(compute_stats_for_s2_paper(s2_id), "paper")
            if id_type == "author":
                return finish(
                    compute_stats_for_s2_author(s2_id, author_name), "author"
                )
        elif submit_type == "acl_link" and acl_link:
            # The link may point to a single paper, an author or a venue
            url_type = determine_page_type(acl_link)
            if url_type == "paper":
                return finish(compute_stats_for_acl_paper(acl_link), "paper")
            if url_type == "author":
                return finish(compute_stats_for_acl_author(acl_link), "author")
            if url_type == "venue":
                return finish(
                    compute_stats_for_acl_venue(acl_link), "proceedings"
                )
        elif submit_type == "pdf_file" and pdf_file:
            # compute_stats_for_pdf is a coroutine function; run it to
            # completion from this synchronous handler
            return finish(asyncio.run(compute_stats_for_pdf(pdf_file)), "paper")
        # Nothing to compute: empty input or unrecognised id / link type
        return empty_result

    return compute_stats


def plot_and_return_stats(
    title_authors,
    num_references,
    field_counts,
    year_title_dict,
    cfdi,
    cadi,
    maoc,
    compute_type,
):
    """
    Assemble the values displayed in the result widgets.

    Args:
        title_authors: Returned unchanged as the first output.
        num_references: Returned unchanged as the second output.
        field_counts (dict): Maps a field to its count; the three entries with the highest counts are reported.
        year_title_dict: Accepted but currently unused (the oldest-papers output is disabled).
        cfdi: Numeric index value; plotted as given and rounded to three decimals for display.
        cadi: Accepted but currently unused.
        maoc: Accepted but currently unused (the corresponding plot is disabled).
        compute_type: Forwarded to generate_cfdi_plot together with cfdi.

    Returns:
        tuple: (title_authors, num_references, text listing the top 3 fields as "field: count" lines,
        cfdi rounded to 3 decimals, the cfdi plot).
    """
    # The plot is built first and receives the unrounded index.
    cfdi_figure = generate_cfdi_plot(cfdi, compute_type)

    # Rank fields by count, highest first, and keep the leading three.
    ranked_fields = sorted(
        field_counts.items(), key=lambda entry: entry[1], reverse=True
    )
    summary_lines = []
    for field_name, field_total in ranked_fields[:3]:
        summary_lines.append(f"{field_name}: {field_total}")
    top_fields_text = "\n".join(summary_lines)

    rounded_cfdi = round(cfdi, 3)

    return (
        title_authors,
        num_references,
        top_fields_text,
        rounded_cfdi,
        cfdi_figure,
    )


# Build the demo UI. Components are created inside nested layout context
# managers, so the order of the statements below is the order on the page.
with gr.Blocks(
    theme=gr.themes.Soft()
) as demo:
    # Intro row: two Markdown panels explaining CFDI and how to use the demo.
    with gr.Row():
        gr.Markdown(
            """
            # Citation Field Diversity Calculator

            Welcome to this interactive demo to analyze the field diversity aspect of your citational practice. This tool will enable you to reflect on a critical aspect:

            - By whom am I influenced? Which fields heavily inform and shape the research trajectory of my works?

            In addition, you will be able to analyze how the above compares to the average paper or author. The results you will receive cannot be categorized into “good” or “bad”. Instead, they are meant to raise self-awareness about one’s citational diversity and reflect on it. The results might bring you to further questions, such as:

            - Am I reading widely across fields?
            - Should I expand my literature search to include works from other fields?

            Using citations as a tangible marker of influence, our demo provides empirical insights into the influence of papers across fields.

            ## What is Citation Field Diversity?

            Field diversity is a measure of the variety of research fields that a paper or an author draws upon. A high field diversity indicates that the work draws from various distinct research fields, demonstrating a multidisciplinary influence on that work or author.
            
            ## What is the Citation Field Diversity Index (CFDI) and how is it calculated?

            The calculation of Field Diversity involves extracting all the references of a paper, categorizing them into distinct study fields, and determining the proportion of each study field over all the references. The Citation Field Diversity Index (CFDI) is then computed by applying the Gini Index on these proportions.

            For more details, please refer to Eq. 3 in [this paper](https://aclanthology.org/2023.acl-long.341/).
            """
        )

        gr.Markdown(
            """
            ## How do I Interpret CFDI?

            Higher values of CFDI indicate a greater diversity of a paper in terms of the fields it cites, signifying a multidisciplinary influence. On the other hand, lower values signify a lower diversity, indicating that citations are more concentrated in specific fields.

            ## How can I use this demo?

            There are three ways for you to compute the field diversity for papers:
            1. **Semantic Scholar ID**: Enter the Semantic Scholar ID of a **paper** or **author** and click the *"Compute"* button.
            2. **ACL Anthology Link**: Paste the ACL Anthology link of a **paper**, **venue**, or **author** and click the *"Compute"* button.
            3. **PDF File**: Upload your **paper** PDF and click the *"Compute"* button.

            To retrieve the **Semantic Scholar ID** for a paper such as "The Elephant in the Room: Analyzing the Presence of Big Tech in Natural Language Processing Research," search the paper on Semantic Scholar [here](https://www.semanticscholar.org/paper/The-Elephant-in-the-Room%3A-Analyzing-the-Presence-of-Abdalla-Wahle/587ffdfd7229e8e0dbc5250b44df5fad6251f6ad) and use the last part of the URL. The Semantic Scholar ID (SSID) for this paper is: **587ffdfd7229e8e0dbc5250b44df5fad6251f6ad**.

            To get an ACL Anthology link, you can go to any ACL Anthology paper, author or proceedings page and just copy and paste the url. For example:
            - https://aclanthology.org/2023.acl-long.1/
            - https://aclanthology.org/people/a/anna-rogers/
            - https://aclanthology.org/events/acl-2002/
            """
        )

    # Input row: one tab per input method, each with its own "Compute" button.
    with gr.Row():
        with gr.Tabs():
            with gr.TabItem("Semantic Scholar ID"):
                s2_id = gr.Textbox(
                    label="Semantic Scholar ID",
                    placeholder=(
                        "Enter the Semantic Scholar ID here and press enter..."
                    ),
                    # value="587ffdfd7229e8e0dbc5250b44df5fad6251f6ad",
                )
                with gr.Row():
                    s2_submit_btn = gr.Button("Compute")
            with gr.TabItem("ACL Anthology Link"):
                acl_link = gr.Textbox(
                    label="ACL Anthology Link",
                    placeholder="Paste the ACL Anthology link here...",
                )
                with gr.Row():
                    acl_submit_btn = gr.Button("Compute")
            with gr.TabItem("PDF File"):
                pdf_file = gr.File(
                    file_types=[".pdf"], label="Upload your paper PDF"
                )
                with gr.Row():
                    file_submit_btn = gr.Button("Compute")
    # Result widgets; these are the outputs of the compute handlers below.
    with gr.Row():
        title = gr.Textbox(
            label="Title / Author Name / Venue Name:", lines=2
        )  # Can be either paper title, author name, or proceedings title
    with gr.Row():
        num_ref = gr.Textbox(label="Number of references", lines=3)
        top_field_list = gr.Textbox(label="Top 3 fields cited:", lines=3)
        # top_age_list = gr.Textbox(label="Top 3 oldest papers cited:", lines=3)
    with gr.Row():
        cfdi = gr.Textbox(label="CFDI")
        # cadi = gr.Textbox(label="CADI")
    with gr.Row():
        cfdi_plot = gr.Plot(label="Citation Field Diversity")
        # cadi_plot = gr.Plot(label="Citation Age Diversity")
    with gr.Row():
        clear_btn = gr.Button("Clear")

    # Inputs and outputs shared by every compute handler. All three inputs are
    # passed to each handler; the handler built by create_compute_stats only
    # looks at the one matching its submit_type. Handlers are expected to
    # return one value per entry in `outputs` (five).
    submit_args = dict(
        inputs=[s2_id, pdf_file, acl_link],
        outputs=[
            title,
            num_ref,
            top_field_list,
            # top_age_list,
            cfdi,
            # cadi,
            cfdi_plot,
            # cadi_plot,
        ],
    )

    # One argument set per input type: same inputs/outputs, different handler.
    s2_submit_args = submit_args.copy()
    s2_submit_args["fn"] = create_compute_stats(submit_type="s2_id")

    acl_submit_args = submit_args.copy()
    acl_submit_args["fn"] = create_compute_stats(submit_type="acl_link")

    file_submit_args = submit_args.copy()
    file_submit_args["fn"] = create_compute_stats(submit_type="pdf_file")

    # The textboxes' submit events use the same handlers as the buttons.
    s2_id.submit(**s2_submit_args)
    acl_link.submit(**acl_submit_args)

    acl_submit_btn.click(**acl_submit_args)
    s2_submit_btn.click(**s2_submit_args)
    file_submit_btn.click(**file_submit_args)

    # return_clear yields eight Nones: one for each of the five result widgets
    # and one for each of the three inputs listed here.
    clear_btn.click(
        fn=return_clear,
        inputs=[],
        outputs=[
            title,
            num_ref,
            top_field_list,
            # top_age_list,
            cfdi,
            # cadi,
            cfdi_plot,
            # cadi_plot,
            s2_id,
            acl_link,
            pdf_file,
        ],
    )

# NOTE(review): concurrency_count presumably caps the number of requests
# processed in parallel; the parameter is version-dependent in Gradio, so
# confirm it against the pinned Gradio release.
demo.queue(concurrency_count=3)
# Runs at import time: serve on all interfaces, port 7860.
demo.launch(server_port=7860, server_name="0.0.0.0")