openreviewer committed on
Commit
0bf9463
·
verified ·
1 Parent(s): 725cdd2

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -1,35 +1,16 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ # HIDE ALL OF THE FILES IN THE DIRECTORY
2
+ *.py
3
+ *.log
4
+ *.md
5
+ *.txt
6
+ iclr2024/**
7
+ *.github/**
8
+ *.gitignore
9
+ *.gitattributes
10
+ *.git/**
11
+ *.__pycache__/**
12
+
13
+
14
+
15
+
16
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.github/workflows/deploy.yml ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Deploys the Gradio app to Hugging Face on every push to main.
name: Deploy Gradio App

on:
  push:
    branches:
      - main

jobs:
  deploy:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.12.3'  # Specify the Python version you are using

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt  # Ensure you have a requirements.txt file

      - name: Login to Hugging Face
        env:
          HUGGINGFACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
        run: |
          # Quote the expansion so an empty or unusual secret is still passed
          # as exactly one argument instead of being word-split by the shell.
          huggingface-cli login --token "$HUGGINGFACE_TOKEN"

      - name: Deploy Gradio App
        env:
          HUGGINGFACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
        run: gradio deploy
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ my-venv/
2
+ old/
3
+ arena.log
README.md CHANGED
@@ -1,12 +1,6 @@
1
  ---
2
- title: Reviewer Arena
3
- emoji: 👁
4
- colorFrom: yellow
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: 4.31.3
8
  app_file: app.py
9
- pinned: false
 
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: reviewer-arena
 
 
 
 
 
3
  app_file: app.py
4
+ sdk: gradio
5
+ sdk_version: 4.31.0
6
  ---
 
 
app.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from utils import process_paper
3
+ import os
4
+ import logging
5
+ import html
6
+ from logging_config import setup_logging
7
+
8
+
9
# Initialise logging before anything else in this module emits a record.
setup_logging()

# Directories handed to process_paper() together with the API keys below.
paper_dir = 'path_to_temp_storage'
prompt_dir = 'iclr2024'

# Provider credentials, looked up in the environment; a variable that is not
# set yields None for that entry.
api_keys = {
    key_name: os.environ.get(env_name)
    for key_name, env_name in (
        ('openai_api_key', 'openai_api_key'),
        ('claude_api_key', 'anthropic_api_key'),
        ('gemini_api_key', 'google_api_key'),
        ('commandr_api_key', 'cohere_api_key'),
    )
}

# True: call the real APIs through process_paper(); False: serve dummy reviews.
use_real_api = False
22
+
23
def _escape_review_text(text):
    """Return ``text`` as HTML-safe markup with newlines rendered as ``<br>``."""
    # Escape first, then add the line breaks: that way the only markup left in
    # the result is the <br> tags inserted here, never anything from ``text``.
    return html.escape(text.strip()).replace('\n', '<br>')


def _dummy_review(model_number):
    """Build the placeholder review shown while ``use_real_api`` is False."""
    return {
        "Summary": f"This is a placeholder review for Model {model_number}. The paper explores advanced methodologies in reinforcement learning applied to autonomous driving systems, proposing significant enhancements to decision-making algorithms that could improve safety and operational efficiency. The authors provide a detailed analysis of the current limitations of existing systems and suggest innovative solutions that could transform the field.",
        "Soundness": "The assumptions underlying the proposed enhancements are occasionally not fully justified, particularly concerning the scalability of the algorithms under varied and unpredictable traffic conditions. A more rigorous examination of these assumptions is necessary to solidify the paper's foundation.",
        "Presentation": "While the paper is structured adequately, some sections delve into technical details that are not sufficiently elucidated for a broader audience. This could potentially limit the paper's impact and accessibility, making it challenging for non-specialists to fully grasp the implications of the research.",
        "Contribution": "The paper makes a moderate contribution to the existing body of knowledge, offering incremental improvements over current methodologies rather than a completely novel approach. However, these improvements are significant and could lead to better practical implementations in the field of autonomous driving.",
        "Strengths": "The initial results presented in the paper are promising, showing potential for the proposed methods. The inclusion of real-world data in the preliminary experiments adds a layer of credibility and relevance to the results, showcasing the practical applicability of the research.",
        "Weaknesses": "The paper lacks detailed exposition on the methodology, particularly in how the algorithms adapt to unexpected or novel scenarios. This is a critical area that requires further development and testing to ensure the robustness and reliability of the proposed solutions.",
        "Questions/Suggestions": "The statistical analysis section could be enhanced by incorporating more robust statistical techniques and a wider array of metrics. Additionally, conducting tests in a variety of driving environments could help in substantiating the claims made and strengthen the overall findings of the research.",
        "Ethics Review": "The research complies with all ethical standards, addressing potential ethical issues related to autonomous driving comprehensively. Issues such as privacy concerns, decision-making in critical situations, and the overall impact on societal norms are discussed and handled with the utmost care.",
        "Overall Score": "3/5",
        "Confidence": "Confidence in the findings is moderate. While the initial results are encouraging, the limited scope of testing and some unresolved questions regarding scalability and robustness temper the confidence in these results.",
        "Code of Conduct": "There are no violations of the code of conduct noted. The research upholds ethical standards and maintains transparency in methodologies and data usage, contributing to its integrity and the trustworthiness of the findings.",
    }


def review_papers(pdf_file):
    """Return one HTML-formatted review per model for the uploaded paper.

    When ``use_real_api`` is true the reviews come from ``process_paper``;
    each review is an iterable of ``"Header: text"`` strings, which are split
    on the first colon (entries without a colon are skipped).  Otherwise two
    built-in placeholder reviews are used.

    Args:
        pdf_file: The uploaded file as delivered by the Gradio file input.

    Returns:
        list[str]: One HTML fragment per review.  Section names and section
        text are HTML-escaped, and newlines in the text become ``<br>``.
    """
    logging.info(f"Received file type: {type(pdf_file)}")
    if use_real_api:
        reviews = []
        for raw_review in process_paper(pdf_file, paper_dir, prompt_dir, api_keys):
            sections = {}
            for section in raw_review:
                if ':' in section:  # Ensure there is a colon to split on
                    key, value = section.split(':', 1)  # Split on the first colon only
                    sections[key.strip()] = value
            reviews.append(sections)
    else:
        # Dummy reviews for testing with structured sections
        reviews = [_dummy_review(1), _dummy_review(2)]

    review_texts = []
    for review in reviews:
        parts = ["<div class='review-container'>"]
        for section, content in review.items():
            # The text is escaped exactly once and never unescaped again, so
            # review content cannot be interpreted as markup by the browser.
            parts.append(
                f"<div class='review-section'><strong>{html.escape(section.strip())}:</strong> "
                f"<span>{_escape_review_text(content)}</span></div>"
            )
        parts.append("</div>")
        review_texts.append("".join(parts))
    logging.debug(f"Final formatted reviews: {review_texts}")
    return review_texts
87
+
88
def setup_interface():
    """Build the Reviewer Arena Gradio app and return it without launching.

    The page has a file upload with a submit button, two side-by-side review
    panes (Model A / Model B) filled by ``review_papers``, and a radio group
    for voting on the better review.

    Returns:
        gr.Blocks: The assembled interface; the caller is expected to call
        ``launch()`` on it.
    """
    logging.debug("Setting up Gradio interface.")
    css = """
    .review-container {
        padding: 10px;
        margin-bottom: 20px;
        border: 1px solid #ccc;
        background-color: #f9f9f9;
    }
    .review-section {
        margin-bottom: 12px;
        padding: 8px;
        background-color: #ffffff;
        border-left: 4px solid #007BFF;
        padding-left: 10px;
    }
    .review-section strong {
        color: #333;
        font-weight: bold;
        display: block;
        margin-bottom: 5px;
    }
    .review-section span, .gr-markdown {
        color: #000;
        font-size: 14px;
        line-height: 1.5;
        display: block;
        white-space: normal;
        opacity: 1;
    }
    .model-label {
        font-size: 18px;
        font-weight: bold;
        color: #007BFF;
        margin-bottom: 10px;
    }
    .gr-file, .gr-button, .gr-radio {
        width: 300px;
        margin: auto;
    }
    """
    with gr.Blocks(css=css) as demo:
        gr.Markdown("## Reviewer Arena")
        gr.Markdown("Upload an academic paper to get reviews from two randomly selected LLMs.")
        with gr.Row():
            file_input = gr.File(label="Upload Academic Paper")
            submit_button = gr.Button("Submit!!")
        with gr.Row():
            with gr.Column():
                gr.HTML("<div class='model-label'>Model A</div>")
                review1 = gr.Markdown()
            with gr.Column():
                gr.HTML("<div class='model-label'>Model B</div>")
                review2 = gr.Markdown()

        # Voting options
        vote_options = ["👍 A is better", "👍 B is better", "👔 Tie", "👎 Both are bad"]
        # The default must be one of the declared choices; the bare string
        # "Tie" is not in the list, so reference the list entry itself.
        vote = gr.Radio(label="Vote on the best model", choices=vote_options, value=vote_options[2])
        vote_button = gr.Button("Submit Vote")

        def handle_vote(vote):
            """Report the submitted vote on stdout and return a confirmation message."""
            print(f"Vote received: {vote}")
            return f"Vote for '{vote}' received!"

        # The confirmation text goes to a hidden textbox, so nothing is shown.
        vote_button.click(fn=handle_vote, inputs=vote, outputs=gr.Textbox(visible=False))

        # review_papers returns a list; its items fill review1 and review2 in order.
        submit_button.click(
            fn=review_papers,
            inputs=[file_input],
            outputs=[review1, review2]
        )
    logging.debug("Gradio interface setup complete.")
    return demo
161
+
162
if __name__ == "__main__":
    # No logging.basicConfig() call here: setup_logging() already attached a
    # handler to the root logger when this module was imported, and
    # basicConfig() does nothing once the root logger has handlers.
    demo = setup_interface()
    # TODO: block paths of all the files before launching (not implemented yet).
    # To require a login, launch with
    # auth=(os.environ.get('login_username'), os.environ.get('login_password')).
    demo.launch()
file_utils.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
def read_file(file_path):
    """Return the full contents of ``file_path`` decoded as UTF-8 text."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        contents = source.read()
    return contents
iclr2024/.DS_Store ADDED
Binary file (6.15 kB). View file
 
iclr2024/question1.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Briefly summarize the paper and its contributions. This is not the place to critique the paper; the authors should generally agree with a well-written summary.
iclr2024/question10.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Please provide a 'confidence score' for your assessment of this submission to indicate how confident you are in your evaluation. 5: You are absolutely certain about your assessment. You are very familiar with the related work and checked the math/other details carefully. 4: You are confident in your assessment, but not absolutely certain. It is unlikely, but not impossible, that you did not understand some parts of the submission or that you are unfamiliar with some pieces of related work. 3: You are fairly confident in your assessment. It is possible that you did not understand some parts of the submission or that you are unfamiliar with some pieces of related work. Math/other details were not carefully checked. 2: You are willing to defend your assessment, but it is quite likely that you did not understand the central parts of the submission or that you are unfamiliar with some pieces of related work. Math/other details were not carefully checked. 1: Your assessment is an educated guess. The submission is not in your area or the submission was difficult to understand. Math/other details were not carefully checked.
iclr2024/question11.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ If there are no violations of the Code of Conduct with this paper, please respond with NO. Otherwise, if this paper violates the Code of Conduct, please indicate the relevant section(s) from the following options:
2
+
3
+ Yes, Harassment, bullying, or discrimination based on personal characteristics
4
+ Yes, Inappropriate physical contact, sexual harassment, or unwelcome sexual attention
5
+ Yes, Offensive comments related to gender, race, religion, or other protected characteristics
6
+ Yes, Disruption of talks or other events, or behavior interfering with participation
7
+ Yes, Inappropriate use of imagery, language, or personal attacks in virtual interactions
iclr2024/question2.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Please assign the paper a numerical rating on the following scale to indicate the soundness of the technical claims, experimental and research methodology and on whether the central claims of the paper are adequately supported with evidence: 4 excellent, 3 good, 2 fair, 1 poor.
iclr2024/question3.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Please assign the paper a numerical rating on the following scale to indicate the quality of the presentation. This should take into account the writing style and clarity, as well as contextualization relative to prior work: 4 excellent, 3 good, 2 fair, 1 poor.
iclr2024/question4.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Please assign the paper a numerical rating on the following scale to indicate the quality of the overall contribution this paper makes to the research area being studied. Are the questions being asked important? Does the paper bring a significant originality of ideas and/or execution? Are the results valuable to share with the broader NeurIPS community? 4 excellent, 3 good, 2 fair, 1 poor.
iclr2024/question5.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Please give a substantive assessment of the strengths of the paper, touching on each of the following dimensions: originality, quality, clarity, and significance. We encourage reviewers to be broad in their definitions of originality and significance. For example, originality may arise from a new definition or problem formulation, creative combinations of existing ideas, application to a new domain, or removing limitations from prior results. You can incorporate Markdown and Latex into your review.
iclr2024/question6.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Please give a substantive assessment of the weaknesses of the paper. Focus on constructive and actionable insights on how the work could improve towards its stated goals. Be specific, avoid generic remarks. For example, if you believe the contribution lacks novelty, provide references and an explanation as evidence; if you believe experiments are insufficient, explain why and exactly what is missing, etc.
iclr2024/question7.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Please list and carefully describe any questions and suggestions for the authors. Think of the things where a response from the author can change your opinion, clarify a confusion or address a limitation. This is important for a productive rebuttal and discussion phase with the authors.
iclr2024/question8.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ If there are ethical issues with this paper, please flag the paper for an ethics review and select area of expertise that would be most useful for the ethics reviewer to have: No ethics review needed, Ethics review needed: Discrimination / Bias / Fairness Concerns, Ethics review needed: Inadequate Data and Algorithm Evaluation, Ethics review needed: Inappropriate Potential Applications & Impact (e.g., human rights concerns), Ethics review needed: Privacy and Security (e.g., consent, surveillance, data storage concern), Ethics review needed: Compliance (e.g., GDPR, copyright, license, terms of use), Ethics review needed: Research Integrity Issues (e.g., plagiarism), Ethics review needed: Responsible Research Practice (e.g., IRB, documentation, research ethics), Ethics review needed: Failure to comply with NeurIPS Code of Ethics (lack of required documentation, safeguards, disclosure, licenses, legal compliance)
iclr2024/question9.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Please provide an 'overall score' for this submission: 10: Award quality: Technically flawless paper with groundbreaking impact, with exceptionally strong evaluation, reproducibility, and resources, and no unaddressed ethical considerations. 9: Very Strong Accept: Technically flawless paper with groundbreaking impact on at least one area of AI/ML and excellent impact on multiple areas of AI/ML, with flawless evaluation, resources, and reproducibility, and no unaddressed ethical considerations. 8: Strong Accept: Technically strong paper, with novel ideas, excellent impact on at least one area, or high-to-excellent impact on multiple areas, with excellent evaluation, resources, and reproducibility, and no unaddressed ethical considerations. 7: Accept: Technically solid paper, with high impact on at least one sub-area, or moderate-to-high impact on more than one areas, with good-to-excellent evaluation, resources, reproducibility, and no unaddressed ethical considerations. 6: Weak Accept: Technically solid, moderate-to-high impact paper, with no major concerns with respect to evaluation, resources, reproducibility, ethical considerations. 5: Borderline accept: Technically solid paper where reasons to accept outweigh reasons to reject, e.g., limited evaluation. Please use sparingly. 4: Borderline reject: Technically solid paper where reasons to reject, e.g., limited evaluation, outweigh reasons to accept, e.g., good evaluation. Please use sparingly. 3: Reject: For instance, a paper with technical flaws, weak evaluation, inadequate reproducibility and incompletely addressed ethical considerations. 2: Strong Reject: For instance, a paper with major technical flaws, and/or poor evaluation, limited impact, poor reproducibility and mostly unaddressed ethical considerations. 1: Very Strong Reject: For instance, a paper with trivial results or unaddressed ethical considerations.
iclr2024/systemrole.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are a very critical but fair peer reviewer. You will be provided with papers submitted to a conference/journal to review. The papers will be delimited with #### characters.
2
+
3
+ We are aiming for a 20-25% acceptance rate. Average score thresholds of 5.5-5.7 roughly correspond to acceptance rates of 25%-20%. It is certainly possible to both accept papers below this threshold and reject papers above it. But any such decision should be properly explained.
4
+
5
+ The statistics for the previous year were: A total of 3422 submissions were received. The average score of all submissions was 5.47 with standard deviation 1.30, with scores ranging from 1.00 to 9.00. Aim for a similar distribution of scores and use the full range of scores between 1-10.
6
+
7
+ Out of all submissions, 32% (1095 submissions) were accepted, with scores ranging from 4.50 to 9.00 and an average score of 6.61 with a standard deviation of 0.75. Only 2.1% (55 submissions) were accepted for oral presentation, with scores ranging from 5.00 to 9.00 and an average score of 7.80 with a standard deviation of 0.63.
8
+
9
+ 6.64% (174 submissions) were selected for the spotlight, with scores ranging from 5.60 to 8.60 and an average score of 7.33 with a standard deviation of 0.58. 33.04% (866 submissions) were accepted for poster presentation, with scores ranging from 4.50 to 8.00 and an average score of 6.39 with a standard deviation of 0.61.
10
+
11
+ 60.36% (1582 submissions) were rejected, with scores ranging from 1.00 to 7.50 and an average score of 4.69 with a standard deviation of 0.97. Additionally, 775 submissions were withdrawn and 26 were desk rejected.
logging_config.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+
3
def setup_logging():
    """Configure root logging to write DEBUG-and-above records to ``arena.log``.

    Emits one INFO record afterwards to mark that configuration ran.
    """
    log_settings = {
        "filename": "arena.log",
        "level": logging.DEBUG,  # Change to DEBUG level
        "format": '%(asctime)s - %(levelname)s - %(message)s',
    }
    logging.basicConfig(**log_settings)
    logging.info("Logging setup complete.")
models.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import openai
4
+ import tiktoken
5
+ import re
6
+ import anthropic
7
+ import cohere
8
+ import google.generativeai as genai
9
+ import time
10
+ from file_utils import read_file
11
+ from openai import OpenAI
12
+
13
class Paper:
    """Pair a paper's arXiv identifier with the text that will be reviewed."""

    def __init__(self, arxiv_id, tex_file):
        # Both values are stored as given; nothing is validated or copied.
        self.arxiv_id, self.tex_file = arxiv_id, tex_file
17
+
18
class PaperProcessor:
    """Produce a section-by-section review of a paper with one LLM backend.

    Prompts are read from ``prompt_dir``: ``systemrole.txt`` holds the system
    role and ``question1.txt`` .. ``question11.txt`` hold one review question
    each.  ``model`` selects the backend used by :meth:`call_model`
    (``'gpt'``, ``'claude'``, ``'commandr'`` or ``'gemini'``).
    """

    # Prompts longer than this many tokens are cut down by truncate_content().
    MAX_TOKENS = 127192
    # Tokenizer used for every token count, whichever backend is selected.
    encoding = tiktoken.encoding_for_model("gpt-4-0125-preview")
    # Top of the rating scale requested by each numeric question file:
    # questions 2-4 ask for 1-4, question 9 for 1-10 and question 10 for 1-5.
    _SCORE_SCALES = {2: 4, 3: 4, 4: 4, 9: 10, 10: 5}

    def __init__(self, prompt_dir, model, openai_api_key, claude_api_key, gemini_api_key, commandr_api_key):
        """Store the prompt directory, the backend name and the provider API keys."""
        self.prompt_dir = prompt_dir
        self.model = model
        self.openai_api_key = openai_api_key
        self.claude_api_key = claude_api_key
        self.gemini_api_key = gemini_api_key
        self.commandr_api_key = commandr_api_key

    def count_tokens(self, text):
        """Return the number of tokens ``text`` encodes to."""
        return len(self.encoding.encode(text))

    def truncate_content(self, content):
        """Return ``content`` cut to at most ``MAX_TOKENS`` tokens.

        Content already within the limit is returned unchanged.
        """
        tokens = self.encoding.encode(content)  # encode once, reuse for the slice
        logging.debug(f"Token count before truncation: {len(tokens)}")
        if len(tokens) <= self.MAX_TOKENS:
            return content
        truncated_content = self.encoding.decode(tokens[:self.MAX_TOKENS])
        logging.debug(f"Content truncated. Token count after truncation: {self.count_tokens(truncated_content)}")
        return truncated_content

    def prepare_base_prompt(self, paper):
        """Return the text of ``paper`` that the first question is asked about."""
        return paper.tex_file

    def call_model(self, prompt, model_type):
        """Send ``prompt`` to the backend named by ``model_type``.

        Args:
            prompt: The user prompt text.
            model_type: One of ``'gpt'``, ``'claude'``, ``'commandr'``, ``'gemini'``.

        Returns:
            The model's reply text, or ``None`` when ``systemrole.txt`` is
            missing, ``model_type`` is not recognised, or the provider call
            raises (the error is logged in each case).
        """
        system_role_file_path = os.path.join(self.prompt_dir, "systemrole.txt")
        if not os.path.exists(system_role_file_path):
            logging.error(f"System role file not found: {system_role_file_path}")
            return None

        system_role = read_file(system_role_file_path)
        logging.debug(f"Token count of full prompt: {self.count_tokens(prompt)}")
        logging.info(f"Sending the following prompt to {model_type}: {prompt}")

        try:
            if model_type == 'gpt':
                client = OpenAI(api_key=self.openai_api_key)
                messages = [{"role": "system", "content": system_role}, {"role": "user", "content": prompt}]
                completion = client.chat.completions.create(
                    model="gpt-4-turbo-2024-04-09",
                    messages=messages,
                    temperature=1
                )
                return completion.choices[0].message.content.strip()

            elif model_type == 'claude':
                client = anthropic.Anthropic(api_key=self.claude_api_key)
                response = client.messages.create(
                    model='claude-3-opus-20240229',
                    max_tokens=4096,
                    system=system_role,
                    temperature=0.5,
                    messages=[{"role": "user", "content": prompt}]
                )
                return response.content[0].text

            elif model_type == 'commandr':
                co = cohere.Client(self.commandr_api_key)
                response = co.chat(
                    model="command-r-plus",
                    message=prompt,
                    preamble=system_role
                )
                return response.text

            elif model_type == 'gemini':
                genai.configure(api_key=self.gemini_api_key)
                model = genai.GenerativeModel('gemini-pro')
                # This branch used to drop the system role entirely; prepend it
                # so every backend receives the same reviewer instructions.
                response = model.generate_content(f"{system_role}\n\n{prompt}")
                return response.candidates[0].content.parts[0].text

            # Make an unsupported backend name visible instead of silently
            # falling through to an implicit None.
            logging.error(f"Unknown model type: {model_type}")
            return None

        except Exception as e:
            logging.error(f"Exception occurred: {e}")
            return None

    def is_content_appropriate(self, content):
        """Return False when the OpenAI moderation endpoint flags ``content``.

        Fails open: if the moderation call raises, the error is logged and the
        content is treated as appropriate.
        """
        try:
            response = openai.moderations.create(input=content)
            # The response is an object, not a dict; subscripting it raised on
            # every call, which the except below turned into "always True".
            return not response.results[0].flagged
        except Exception as e:
            logging.error(f"Exception occurred while checking content appropriateness: {e}")
            return True  # In case of an error, default to content being appropriate

    def get_prompt_files(self, prompt_dir):
        """Return the names of the ``question*.txt`` files in ``prompt_dir`` (directory order)."""
        return [f for f in os.listdir(prompt_dir) if f.endswith('.txt') and f.startswith('question')]

    def _format_score(self, question_number, response):
        """Reduce a numeric answer to ``"<score>/<scale>"`` for rating questions.

        The first integer in ``response`` is taken as the score and capped at
        the scale of that question.  Non-rating questions, and answers that
        contain no integer, are returned unchanged.
        """
        scale = self._SCORE_SCALES.get(question_number)
        if scale is None:
            return response
        number_match = re.search(r'\b\d+\b', response)
        if number_match is None:
            return response
        return f"{min(int(number_match.group(0)), scale)}/{scale}"

    def process_paper(self, paper):
        """Run all eleven review questions on ``paper`` and collect the answers.

        Args:
            paper: Object whose ``tex_file`` attribute holds the paper text.

        Returns:
            A list of ``"<Header> <answer>"`` strings, one per question, with
            ``"N/A"`` standing in for any answer the backend failed to give.
            If moderation flags the paper, the two-item list
            ``["Desk Rejected", <reason>]`` is returned instead; if the paper
            has no text, an error string is returned.
        """
        openai.api_key = self.openai_api_key
        start_time = time.time()

        base_prompt = self.prepare_base_prompt(paper)
        if base_prompt is None:
            return "Error: Base prompt could not be prepared."

        moderation_response = openai.moderations.create(input=base_prompt)
        if moderation_response.results[0].flagged:
            return ["Desk Rejected", "The paper contains inappropriate or harmful content."]

        review_output = []
        previous_responses = []
        header = ['Summary:', 'Soundness:', 'Presentation:', 'Contribution:', 'Strengths:', 'Weaknesses:', 'Questions:', 'Flag For Ethics Review:', 'Rating:', 'Confidence:', 'Code Of Conduct:']
        for i in range(1, 12):
            question_file = os.path.join(self.prompt_dir, f"question{i}.txt")
            question_text = read_file(question_file)

            if i == 1:
                # Only the first question sees the paper, delimited by ####.
                prompt = f"{question_text}\n\n####\n{base_prompt}\n####"
            else:
                # NOTE(review): later questions are answered from the review
                # written so far, without the paper text — confirm intended.
                prompt = f"\nHere is your review so far:\n{' '.join(previous_responses)}\n\nHere are your reviewer instructions. Please answer the following question:\n{question_text}"

            truncated_prompt = self.truncate_content(prompt)
            logging.info(f"Processing prompt for question {i}")

            response = self.call_model(truncated_prompt, self.model)
            if response is None:
                response = "N/A"

            response = self._format_score(i, response)

            review_output.append(f"{header[i-1]} {response}")
            previous_responses.append(response)

        elapsed_time = time.time() - start_time
        print(f"Time taken to process paper: {elapsed_time:.2f} seconds")
        return review_output
158
+
requirements.txt ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.2.1
2
+ altair==5.3.0
3
+ annotated-types==0.6.0
4
+ anthropic==0.25.8
5
+ anyio==4.3.0
6
+ attrs==23.2.0
7
+ beautifulsoup4==4.12.3
8
+ boto3==1.34.103
9
+ botocore==1.34.103
10
+ cachetools==5.3.3
11
+ certifi==2024.2.2
12
+ charset-normalizer==3.3.2
13
+ click==8.1.7
14
+ cohere==5.4.0
15
+ colorama==0.4.6
16
+ contourpy==1.2.1
17
+ cycler==0.12.1
18
+ distro==1.9.0
19
+ dnspython==2.6.1
20
+ email_validator==2.1.1
21
+ fastapi==0.111.0
22
+ fastapi-cli==0.0.3
23
+ fastavro==1.9.4
24
+ ffmpy==0.3.2
25
+ filelock==3.14.0
26
+ fonttools==4.51.0
27
+ fsspec==2024.3.1
28
+ google==3.0.0
29
+ google-ai-generativelanguage==0.6.2
30
+ google-api-core==2.19.0
31
+ google-api-python-client==2.129.0
32
+ google-auth==2.29.0
33
+ google-auth-httplib2==0.2.0
34
+ google-generativeai==0.5.2
35
+ googleapis-common-protos==1.63.0
36
+ gradio==4.31.0
37
+ gradio_client==0.16.2
38
+ grpcio==1.63.0
39
+ grpcio-status==1.62.2
40
+ h11==0.14.0
41
+ httpcore==1.0.5
42
+ httplib2==0.22.0
43
+ httptools==0.6.1
44
+ httpx==0.27.0
45
+ httpx-sse==0.4.0
46
+ huggingface-hub==0.23.0
47
+ idna==3.7
48
+ importlib_resources==6.4.0
49
+ Jinja2==3.1.4
50
+ jmespath==1.0.1
51
+ jsonschema==4.22.0
52
+ jsonschema-specifications==2023.12.1
53
+ kiwisolver==1.4.5
54
+ markdown-it-py==3.0.0
55
+ MarkupSafe==2.1.5
56
+ matplotlib==3.8.4
57
+ mdurl==0.1.2
58
+ numpy==1.26.4
59
+ openai==1.28.1
60
+ orjson==3.10.3
61
+ packaging==24.0
62
+ pandas==2.2.2
63
+ pillow==10.3.0
64
+ proto-plus==1.23.0
65
+ protobuf==4.25.3
66
+ pyasn1==0.6.0
67
+ pyasn1_modules==0.4.0
68
+ pydantic==2.7.1
69
+ pydantic_core==2.18.2
70
+ pydub==0.25.1
71
+ Pygments==2.18.0
72
+ PyMuPDF==1.24.3
73
+ PyMuPDFb==1.24.3
74
+ pyparsing==3.1.2
75
+ python-dateutil==2.9.0.post0
76
+ python-dotenv==1.0.1
77
+ python-multipart==0.0.9
78
+ pytz==2024.1
79
+ PyYAML==6.0.1
80
+ referencing==0.35.1
81
+ regex==2024.5.10
82
+ requests==2.31.0
83
+ rich==13.7.1
84
+ rpds-py==0.18.1
85
+ rsa==4.9
86
+ ruff==0.4.4
87
+ s3transfer==0.10.1
88
+ semantic-version==2.10.0
89
+ shellingham==1.5.4
90
+ six==1.16.0
91
+ sniffio==1.3.1
92
+ soupsieve==2.5
93
+ starlette==0.37.2
94
+ tiktoken==0.6.0
95
+ tokenizers==0.19.1
96
+ tomlkit==0.12.0
97
+ toolz==0.12.1
98
+ tqdm==4.66.4
99
+ typer==0.12.3
100
+ types-requests==2.31.0.20240406
101
+ typing_extensions==4.11.0
102
+ tzdata==2024.1
103
+ ujson==5.9.0
104
+ uritemplate==4.1.1
105
+ urllib3==2.2.1
106
+ uvicorn==0.29.0
107
+ watchfiles==0.21.0
108
+ websockets==11.0.3
utils.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz
2
+ import os
3
+ import logging
4
+ import random
5
+ from models import Paper, PaperProcessor
6
+
7
def extract_text_from_pdf(filename):
    """Return the text of every page of the PDF at *filename* as one string.

    Page texts are concatenated in document order with no separator. The
    result is round-tripped through Latin-1 with the ``replace`` error
    handler, so any character outside Latin-1 comes back as ``?``.
    """
    with fitz.open(filename) as doc:
        page_texts = [page.get_text() for page in doc]
    combined = "".join(page_texts)
    return combined.encode('latin-1', 'replace').decode('latin-1')
13
+
14
def process_paper(pdf_file, paper_dir, prompt_dir, api_keys):
    """Generate reviews of one PDF from two randomly selected models.

    Args:
        pdf_file: Either a path string to a PDF, or a file-like object
            exposing ``name`` and ``read()``. File-like input is first saved
            into *paper_dir*.
        paper_dir: Directory that receives saved uploads; created if missing.
        prompt_dir: Passed through unchanged to ``PaperProcessor``.
        api_keys: Mapping expanded as keyword arguments to ``PaperProcessor``.

    Returns:
        A list with one entry per selected model, each being whatever
        ``PaperProcessor.process_paper`` returned, or ``[]`` when *pdf_file*
        is neither a path string nor a file-like object.
    """
    logging.info("Processing file type in process_paper: %s", type(pdf_file))
    logging.debug("Starting to process paper: %s", pdf_file)
    # Ensure the directory exists
    os.makedirs(paper_dir, exist_ok=True)

    if isinstance(pdf_file, str):
        # Assume pdf_file is a path to the PDF file
        pdf_path = pdf_file
    elif hasattr(pdf_file, 'name') and hasattr(pdf_file, 'read'):
        # Keep only the final path component of the upload's name. Joining an
        # absolute name would discard paper_dir, so the "wb" open below would
        # truncate the very file being read; a name containing ".." would
        # escape paper_dir.
        safe_name = os.path.basename(str(pdf_file.name)) or "uploaded.pdf"
        pdf_path = os.path.join(paper_dir, safe_name)
        with open(pdf_path, "wb") as f:
            f.write(pdf_file.read())
    else:
        logging.error("Received object is neither a path nor a file-like object.")
        return []

    # Extract text from the PDF
    extracted_text = extract_text_from_pdf(pdf_path)
    # The paper keeps the name exactly as the caller supplied it; only the
    # on-disk location above is sanitised.
    paper_name = pdf_file.name if hasattr(pdf_file, 'name') else os.path.basename(pdf_path)
    paper = Paper(paper_name, extracted_text)

    # Randomly select two distinct models
    models = ['gpt', 'claude', 'gemini', 'commandr']
    selected_models = random.sample(models, 2)

    # Process the paper with each selected model
    reviews = []
    for model in selected_models:
        processor = PaperProcessor(prompt_dir, model, **api_keys)
        reviews.append(processor.process_paper(paper))
    logging.debug("Reviews generated: %s", reviews)
    return reviews