openreviewer committed
Commit 25f01d1 · Parent(s): 0bf9463
Upload folder using huggingface_hub
Files changed:
- .gitattributes +16 -16
- .github/workflows/deploy.yml +53 -53
- .gitignore +2 -2
- app.py +254 -168
- file_utils.py +2 -2
- iclr2024/question11.txt +6 -6
- iclr2024/systemrole.txt +10 -10
- logging_config.py +8 -8
- models.py +158 -158
- requirements.txt +108 -108
- utils.py +45 -49
.gitattributes
CHANGED
@@ -1,16 +1,16 @@
# HIDE ALL OF THE FILES IN THE DIRECTORY
*.py
*.log
*.md
*.txt
iclr2024/**
*.github/**
*.gitignore
*.gitattributes
*.git/**
*.__pycache__/**
.github/workflows/deploy.yml
CHANGED
@@ -1,54 +1,54 @@
name: Deploy Gradio App

on:
  push:
    branches:
      - main

jobs:
  deploy:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.12.3' # Specify the Python version you are using

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt # Ensure you have a requirements.txt file

      - name: Login to Hugging Face
        env:
          HUGGINGFACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
        run: |
          huggingface-cli login --token $HUGGINGFACE_TOKEN

      - name: Deploy Gradio App
        env:
          HUGGINGFACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
        run: gradio deploy

      # - name: Upload to Hugging Face Spaces
      #   env:
      #     HF_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
      #   run: |
      #     git lfs install
      #     huggingface-cli lfs-enable-largefiles .
      #     huggingface-cli repo create reviewerarena/reviewer-arena --type=space
      #     huggingface-cli repo upload reviewerarena/reviewer-arena . --all-yes
      # - name: Login to Hugging Face
      #   env:
      #     HUGGINGFACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
      #   run: |
      #     echo "$HUGGINGFACE_TOKEN" | huggingface-cli login --token

      # - name: Deploy Gradio App
      #   env:
      #     HUGGINGFACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
      #   run: |
      #     gradio deploy --token $HUGGINGFACE_TOKEN
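The workflow assumes a repository secret named HUGGING_FACE_TOKEN. Before relying on the deploy step, the token can be sanity-checked locally; a minimal sketch using huggingface_hub (the HUGGINGFACE_TOKEN environment variable here is an assumption for illustration, not part of this repo):

# Hypothetical local check that the deploy token is valid before storing it
# as the HUGGING_FACE_TOKEN repository secret consumed by this workflow.
import os
from huggingface_hub import HfApi

api = HfApi(token=os.environ["HUGGINGFACE_TOKEN"])
print(api.whoami()["name"])  # prints the account the token authenticates as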
.gitignore
CHANGED
@@ -1,3 +1,3 @@
my-venv/
old/
arena.log
app.py
CHANGED
@@ -1,168 +1,254 @@
import gradio as gr
from utils import process_paper
import os
import logging
import html
from logging_config import setup_logging

setup_logging()
paper_dir = 'path_to_temp_storage'
prompt_dir = 'iclr2024'
api_keys = {
    'openai_api_key': os.environ.get('openai_api_key'),
    'claude_api_key': os.environ.get('anthropic_api_key'),
    'gemini_api_key': os.environ.get('google_api_key'),
    'commandr_api_key': os.environ.get('cohere_api_key')
}

use_real_api = False


def review_papers(pdf_file):
    logging.info(f"Received file type: {type(pdf_file)}")
    if use_real_api:
        reviews, selected_models = process_paper(
            pdf_file, paper_dir, prompt_dir, api_keys)
        processed_reviews = []
        for review in reviews:
            processed_review = {}
            for section in review:
                if ':' in section:
                    key, value = section.split(':', 1)
                    processed_value = value.strip().replace('\n', '<br>')
                    processed_review[key.strip()] = html.escape(
                        processed_value)
            processed_reviews.append(processed_review)
        reviews = processed_reviews
    else:
        reviews = [
            {
                "Summary": "This is a placeholder review for Model 1. The paper explores advanced methodologies in reinforcement learning applied to autonomous driving systems, proposing significant enhancements to decision-making algorithms that could improve safety and operational efficiency. The authors provide a detailed analysis of the current limitations of existing systems and suggest innovative solutions that could transform the field.",
                "Soundness": "The assumptions underlying the proposed enhancements are occasionally not fully justified, particularly concerning the scalability of the algorithms under varied and unpredictable traffic conditions. A more rigorous examination of these assumptions is necessary to solidify the paper's foundation.",
                "Presentation": "While the paper is structured adequately, some sections delve into technical details that are not sufficiently elucidated for a broader audience. This could potentially limit the paper's impact and accessibility, making it challenging for non-specialists to fully grasp the implications of the research.",
                "Contribution": "The paper makes a moderate contribution to the existing body of knowledge, offering incremental improvements over current methodologies rather than a completely novel approach. However, these improvements are significant and could lead to better practical implementations in the field of autonomous driving.",
                "Strengths": "The initial results presented in the paper are promising, showing potential for the proposed methods. The inclusion of real-world data in the preliminary experiments adds a layer of credibility and relevance to the results, showcasing the practical applicability of the research.",
                "Weaknesses": "The paper lacks detailed exposition on the methodology, particularly in how the algorithms adapt to unexpected or novel scenarios. This is a critical area that requires further development and testing to ensure the robustness and reliability of the proposed solutions.",
                "Questions/Suggestions": "The statistical analysis section could be enhanced by incorporating more robust statistical techniques and a wider array of metrics. Additionally, conducting tests in a variety of driving environments could help in substantiating the claims made and strengthen the overall findings of the research.",
                "Ethics Review": "The research complies with all ethical standards, addressing potential ethical issues related to autonomous driving comprehensively. Issues such as privacy concerns, decision-making in critical situations, and the overall impact on societal norms are discussed and handled with the utmost care.",
                "Overall Score": "3/5",
                "Confidence": "Confidence in the findings is moderate. While the initial results are encouraging, the limited scope of testing and some unresolved questions regarding scalability and robustness temper the confidence in these results.",
                "Code of Conduct": "There are no violations of the code of conduct noted. The research upholds ethical standards and maintains transparency in methodologies and data usage, contributing to its integrity and the trustworthiness of the findings."
            },
            {
                "Summary": "This is a placeholder review for Model 2. The paper explores advanced methodologies in reinforcement learning applied to autonomous driving systems, proposing significant enhancements to decision-making algorithms that could improve safety and operational efficiency. The authors provide a detailed analysis of the current limitations of existing systems and suggest innovative solutions that could transform the field.",
                "Soundness": "The assumptions underlying the proposed enhancements are occasionally not fully justified, particularly concerning the scalability of the algorithms under varied and unpredictable traffic conditions. A more rigorous examination of these assumptions is necessary to solidify the paper's foundation.",
                "Presentation": "While the paper is structured adequately, some sections delve into technical details that are not sufficiently elucidated for a broader audience. This could potentially limit the paper's impact and accessibility, making it challenging for non-specialists to fully grasp the implications of the research.",
                "Contribution": "The paper makes a moderate contribution to the existing body of knowledge, offering incremental improvements over current methodologies rather than a completely novel approach. However, these improvements are significant and could lead to better practical implementations in the field of autonomous driving.",
                "Strengths": "The initial results presented in the paper are promising, showing potential for the proposed methods. The inclusion of real-world data in the preliminary experiments adds a layer of credibility and relevance to the results, showcasing the practical applicability of the research.",
                "Weaknesses": "The paper lacks detailed exposition on the methodology, particularly in how the algorithms adapt to unexpected or novel scenarios. This is a critical area that requires further development and testing to ensure the robustness and reliability of the proposed solutions.",
                "Questions/Suggestions": "The statistical analysis section could be enhanced by incorporating more robust statistical techniques and a wider array of metrics. Additionally, conducting tests in a variety of driving environments could help in substantiating the claims made and strengthen the overall findings of the research.",
                "Ethics Review": "The research complies with all ethical standards, addressing potential ethical issues related to autonomous driving comprehensively. Issues such as privacy concerns, decision-making in critical situations, and the overall impact on societal norms are discussed and handled with the utmost care.",
                "Overall Score": "3/5",
                "Confidence": "Confidence in the findings is moderate. While the initial results are encouraging, the limited scope of testing and some unresolved questions regarding scalability and robustness temper the confidence in these results.",
                "Code of Conduct": "There are no violations of the code of conduct noted. The research upholds ethical standards and maintains transparency in methodologies and data usage, contributing to its integrity and the trustworthiness of the findings."
            }
        ]
        selected_models = ['model1-placeholder', 'model2-placeholder']

    review_texts = []
    for review in reviews:
        formatted_review = "<div class='review-container'>"
        for section, content in review.items():
            formatted_review += f"<div class='review-section'><strong>{section}:</strong> <span>{html.unescape(content)}</span></div>"
        formatted_review += "</div>"
        review_texts.append(formatted_review)

    model_a = selected_models[0]
    model_b = selected_models[1]

    logging.debug(f"Final formatted reviews: {review_texts}")
    return review_texts[0], review_texts[1], gr.update(visible=True), gr.update(visible=True), model_a, model_b


def setup_interface():
    logging.debug("Setting up Gradio interface.")
    css = """
    .review-container {
        padding: 10px;
        margin-bottom: 20px;
        border: 1px solid #ccc;
        background-color: #f9f9f9;
    }
    .review-section {
        margin-bottom: 12px;
        padding: 8px;
        background-color: #ffffff;
        border-left: 4px solid #007BFF;
        padding-left: 10px;
    }
    .review-section strong {
        color: #333;
        font-weight: bold;
        display: block;
        margin-bottom: 5px;
    }
    .review-section span, .gr-markdown {
        color: #000;
        font-size: 14px;
        line-height: 1.5;
        display: block;
        white-space: normal;
        opacity: 1;
    }
    .model-label {
        font-size: 18px;
        font-weight: bold;
        color: #007BFF;
        margin-bottom: 10px;
    }
    .gr-file, .gr-button, .gr-radio {
        width: 300px;
        margin: auto;
    }
    .gr-button-small {
        width: 150px;
        height: 40px;
        font-size: 16px;
    }
    """
    with gr.Blocks(css=css) as demo:
        with gr.Tabs():
            with gr.TabItem("Reviewer Arena"):
                gr.Markdown("## Reviewer Arena")
                gr.Markdown(
                    "Upload an academic paper to get reviews from two randomly selected LLMs.")
                with gr.Row():
                    file_input = gr.File(label="Upload Academic Paper")
                    submit_button = gr.Button(
                        "Submit!", elem_id="submit-button")
                with gr.Row():
                    with gr.Column():
                        gr.HTML("<div class='model-label'>Model A</div>")
                        review1 = gr.Markdown()
                    with gr.Column():
                        gr.HTML("<div class='model-label'>Model B</div>")
                        review2 = gr.Markdown()

                vote_options = ["👍 A is better",
                                "👍 B is better", "👔 Tie", "👎 Both are bad"]
                vote = gr.Radio(label="Vote on the best model",
                                choices=vote_options, value="Tie", visible=False)
                vote_button = gr.Button("Submit Vote", visible=False)
                vote_message = gr.HTML("", visible=False)
                another_paper_button = gr.Button(
                    "Review another paper", visible=False)

                model_identity_message = gr.HTML("", visible=False)

                def handle_vote(vote, model_a, model_b):
                    print(f"Vote received: {vote}")
                    message = f"<p>Thank you for your vote!</p><p>Model A: {model_a}</p><p>Model B: {model_b}</p>"
                    return gr.update(value=message, visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)

                vote_button.click(fn=handle_vote, inputs=[vote, model_identity_message, model_identity_message], outputs=[
                    vote_message, vote, vote_button, another_paper_button])

                submit_button.click(
                    fn=review_papers,
                    inputs=[file_input],
                    outputs=[review1, review2, vote, vote_button,
                             model_identity_message, model_identity_message]
                )

                another_paper_button.click(
                    fn=lambda: None, inputs=None, outputs=None, js="() => { location.reload(); }")
            with gr.TabItem("Leaderboard"):
                gr.Markdown("## Leaderboard")
                leaderboard_html = """
                <table style="width:100%; border: 1px solid #444; border-collapse: collapse; font-family: Arial, sans-serif; background-color: #2b2b2b;">
                    <thead>
                        <tr style="border: 1px solid #444; padding: 12px; background-color: #1a1a1a;">
                            <th style="border: 1px solid #444; padding: 12px; color: #ddd;">Rank</th>
                            <th style="border: 1px solid #444; padding: 12px; color: #ddd;">Model</th>
                            <th style="border: 1px solid #444; padding: 12px; color: #ddd;">Arena Elo</th>
                            <th style="border: 1px solid #444; padding: 12px; color: #ddd;">95% CI</th>
                            <th style="border: 1px solid #444; padding: 12px; color: #ddd;">Votes</th>
                            <th style="border: 1px solid #444; padding: 12px; color: #ddd;">Organization</th>
                            <th style="border: 1px solid #444; padding: 12px; color: #ddd;">License</th>
                            <th style="border: 1px solid #444; padding: 12px; color: #ddd;">Knowledge Cutoff</th>
                        </tr>
                    </thead>
                    <tbody>
                        <tr style="border: 1px solid #444; padding: 12px;">
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">1</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">GPT-4-Turbo-2024-04-09</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">1258</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">+3/-3</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">44592</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">OpenAI</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">Proprietary</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">2023/12</td>
                        </tr>
                        <tr style="border: 1px solid #444; padding: 12px;">
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">2</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">GPT-4-1106-preview</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">1252</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">+2/-3</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">76173</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">OpenAI</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">Proprietary</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">2023/4</td>
                        </tr>
                        <tr style="border: 1px solid #444; padding: 12px;">
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">2</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">Gemini 1.5 Pro API-0409-Preview</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">1249</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">+3/-3</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">61011</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">Google</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">Proprietary</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">2023/11</td>
                        </tr>
                        <tr style="border: 1px solid #444; padding: 12px;">
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">2</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">Claude 3 Opus</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">1248</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">+2/-2</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">101063</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">Anthropic</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">Proprietary</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">2023/8</td>
                        </tr>
                        <tr style="border: 1px solid #444; padding: 12px;">
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">3</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">GPT-4-0125-preview</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">1246</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">+3/-2</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">70239</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">OpenAI</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">Proprietary</td>
                            <td style="border: 1px solid #444; padding: 12px; color: #ddd;">2023/12</td>
                        </tr>
                    </tbody>
                </table>
                """
                gr.HTML(leaderboard_html)

    logging.debug("Gradio interface setup complete.")
    return demo


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    demo = setup_interface()
    demo.launch()
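Because use_real_api is False on this commit, the placeholder branch can be exercised without any API keys; a minimal local smoke test, run from the repo root (the sample path is hypothetical and is only logged, never opened, on this branch):

# Hypothetical smoke test of the placeholder path (use_real_api = False).
from app import review_papers, setup_interface

review_a, review_b, *_, model_a, model_b = review_papers("sample.pdf")
print(model_a, model_b)      # model1-placeholder model2-placeholder
print(review_a[:80])         # start of the formatted HTML review

setup_interface().launch()   # serves the two-tab Gradio UI locally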
file_utils.py
CHANGED
@@ -1,3 +1,3 @@
def read_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()
iclr2024/question11.txt
CHANGED
@@ -1,7 +1,7 @@
If there are no violations of the Code of Conduct with this paper, please respond with NO. Otherwise, if this paper violates the Code of Conduct, please indicate the relevant section(s) from the following options:

Yes, Harassment, bullying, or discrimination based on personal characteristics
Yes, Inappropriate physical contact, sexual harassment, or unwelcome sexual attention
Yes, Offensive comments related to gender, race, religion, or other protected characteristics
Yes, Disruption of talks or other events, or behavior interfering with participation
Yes, Inappropriate use of imagery, language, or personal attacks in virtual interactions
iclr2024/systemrole.txt
CHANGED
@@ -1,11 +1,11 @@
You are a very critical but fair peer reviewer. You will be provided with papers submitted to a conference/journal to review. The papers will be delimited with #### characters.

We are aiming for a 20-25% acceptance rate. Average score thresholds of 5.5-5.7 roughly correspond to acceptance rates of 25%-20%. It is certainly possible to both accept papers below this threshold and reject papers above it. But any such decision should be properly explained.

The statistics for the previous year was: A total of 3422 submissions were received. The average score of all submissions was 5.47 with standard deviation 1.30, with scores ranging from 1.00 to 9.00. Aim for a similar distribution of scores and use the full range of scores between 1-10.

Out of all submissions, 32% (1095 submissions) were accepted, with scores ranging from 4.50 to 9.00 and an average score of 6.61 with a standard deviation of 0.75. Only 2.1% (55 submissions) were accepted for oral presentation, with scores ranging from 5.00 to 9.00 and an average score of 7.80 with a standard deviation of 0.63.

6.64% (174 submissions) were selected for the spotlight, with scores ranging from 5.60 to 8.60 and an average score of 7.33 with a standard deviation of 0.58. 33.04% (866 submissions) were accepted for poster presentation, with scores ranging from 4.50 to 8.00 and an average score of 6.39 with a standard deviation of 0.61.

60.36% (1582 submissions) were rejected, with scores ranging from 1.00 to 7.50 and an average score of 4.69 with a standard deviation of 0.97. Additionally, 775 submissions were withdrawn and 26 were desk rejected.
logging_config.py
CHANGED
@@ -1,9 +1,9 @@
import logging

def setup_logging():
    logging.basicConfig(
        filename="arena.log",
        level=logging.DEBUG,  # Change to DEBUG level
        format='%(asctime)s - %(levelname)s - %(message)s'
    )
    logging.info("Logging setup complete.")
models.py
CHANGED
@@ -1,158 +1,158 @@
import os
import logging
import openai
import tiktoken
import re
import anthropic
import cohere
import google.generativeai as genai
import time
from file_utils import read_file
from openai import OpenAI

class Paper:
    def __init__(self, arxiv_id, tex_file):
        self.arxiv_id = arxiv_id
        self.tex_file = tex_file

class PaperProcessor:
    MAX_TOKENS = 127192
    encoding = tiktoken.encoding_for_model("gpt-4-0125-preview")

    def __init__(self, prompt_dir, model, openai_api_key, claude_api_key, gemini_api_key, commandr_api_key):
        self.prompt_dir = prompt_dir
        self.model = model
        self.openai_api_key = openai_api_key
        self.claude_api_key = claude_api_key
        self.gemini_api_key = gemini_api_key
        self.commandr_api_key = commandr_api_key

    def count_tokens(self, text):
        return len(self.encoding.encode(text))

    def truncate_content(self, content):
        token_count = self.count_tokens(content)
        logging.debug(f"Token count before truncation: {token_count}")
        if token_count > self.MAX_TOKENS:
            tokens = self.encoding.encode(content)
            truncated_tokens = tokens[:self.MAX_TOKENS]
            truncated_content = self.encoding.decode(truncated_tokens)
            logging.debug(f"Content truncated. Token count after truncation: {self.count_tokens(truncated_content)}")
            return truncated_content
        return content

    def prepare_base_prompt(self, paper):
        return paper.tex_file

    def call_model(self, prompt, model_type):
        system_role_file_path = os.path.join(self.prompt_dir, "systemrole.txt")
        if not os.path.exists(system_role_file_path):
            logging.error(f"System role file not found: {system_role_file_path}")
            return None

        system_role = read_file(system_role_file_path)
        logging.debug(f"Token count of full prompt: {self.count_tokens(prompt)}")
        logging.info(f"Sending the following prompt to {model_type}: {prompt}")

        try:
            if model_type == 'gpt':
                client = OpenAI(api_key=self.openai_api_key)
                messages = [{"role": "system", "content": system_role}, {"role": "user", "content": prompt}]
                completion = client.chat.completions.create(
                    model="gpt-4-turbo-2024-04-09",
                    messages=messages,
                    temperature=1
                )
                return completion.choices[0].message.content.strip()

            elif model_type == 'claude':
                client = anthropic.Anthropic(api_key=self.claude_api_key)
                response = client.messages.create(
                    model='claude-3-opus-20240229',
                    max_tokens=4096,
                    system=system_role,
                    temperature=0.5,
                    messages=[{"role": "user", "content": prompt}]
                )
                return response.content[0].text

            elif model_type == 'commandr':
                co = cohere.Client(self.commandr_api_key)
                response = co.chat(
                    model="command-r-plus",
                    message=prompt,
                    preamble=system_role
                )
                return response.text

            elif model_type == 'gemini':
                genai.configure(api_key=self.gemini_api_key)
                model = genai.GenerativeModel('gemini-pro')
                response = model.generate_content(prompt)
                return response.candidates[0].content.parts[0].text

        except Exception as e:
            logging.error(f"Exception occurred: {e}")
            return None

    def is_content_appropriate(self, content):
        try:
            response = openai.moderations.create(input=content)
            return not response["results"][0]["flagged"]
        except Exception as e:
            logging.error(f"Exception occurred while checking content appropriateness: {e}")
            return True  # In case of an error, default to content being appropriate

    def get_prompt_files(self, prompt_dir):
        return [f for f in os.listdir(prompt_dir) if f.endswith('.txt') and f.startswith('question')]

    def process_paper(self, paper):
        openai.api_key = self.openai_api_key
        start_time = time.time()

        base_prompt = self.prepare_base_prompt(paper)
        if base_prompt is None:
            return "Error: Base prompt could not be prepared."

        moderation_response = openai.moderations.create(input=base_prompt)
        if moderation_response.results[0].flagged:
            return ["Desk Rejected", "The paper contains inappropriate or harmful content."]

        review_output = []
        previous_responses = []
        header = ['Summary:', 'Soundness:', 'Presentation:', 'Contribution:', 'Strengths:', 'Weaknesses:', 'Questions:', 'Flag For Ethics Review:', 'Rating:', 'Confidence:', 'Code Of Conduct:']
        for i in range(1, 12):
            question_file = os.path.join(self.prompt_dir, f"question{i}.txt")
            question_text = read_file(question_file)

            if i == 1:
                prompt = f"{question_text}\n\n####\n{base_prompt}\n####"
            else:
                prompt = f"\nHere is your review so far:\n{' '.join(previous_responses)}\n\nHere are your reviewer instructions. Please answer the following question:\n{question_text}"

            truncated_prompt = self.truncate_content(prompt)
            logging.info(f"Processing prompt for question {i}")

            response = self.call_model(truncated_prompt, self.model)
            if response is None:
                response = "N/A"

            if i in [2, 3, 4, 10]:
                number_match = re.search(r'\b\d+\b', response)
                if number_match:
                    number = int(number_match.group(0))
                    response = '5/5' if number > 5 else number_match.group(0) + '/5'
            elif i == 9:
                number_match = re.search(r'\b\d+\b', response)
                if number_match:
                    response = number_match.group(0) + '/10'

            response_with_header = f"{header[i-1]} {response}"
            review_output.append(response_with_header)
            previous_responses.append(response)

        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Time taken to process paper: {elapsed_time:.2f} seconds")
        return review_output
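PaperProcessor can also be driven outside the Gradio app; a minimal sketch for the 'gpt' backend, assuming an OpenAI key in the openai_api_key environment variable (the arXiv id and LaTeX string are placeholders, and the unused backend keys may be None):

# Hypothetical standalone run against a single backend.
import os
from models import Paper, PaperProcessor

paper = Paper(arxiv_id="0000.00000", tex_file=r"\documentclass{article} ...")
processor = PaperProcessor(
    prompt_dir="iclr2024",
    model="gpt",                # one of: 'gpt', 'claude', 'gemini', 'commandr'
    openai_api_key=os.environ["openai_api_key"],
    claude_api_key=None,        # unused when model='gpt'
    gemini_api_key=None,
    commandr_api_key=None,
)
for section in processor.process_paper(paper):
    print(section)              # 'Summary: ...', 'Soundness: ...', etc.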
requirements.txt
CHANGED
@@ -1,108 +1,108 @@
aiofiles==23.2.1
altair==5.3.0
annotated-types==0.6.0
anthropic==0.25.8
anyio==4.3.0
attrs==23.2.0
beautifulsoup4==4.12.3
boto3==1.34.103
botocore==1.34.103
cachetools==5.3.3
certifi==2024.2.2
charset-normalizer==3.3.2
click==8.1.7
cohere==5.4.0
colorama==0.4.6
contourpy==1.2.1
cycler==0.12.1
distro==1.9.0
dnspython==2.6.1
email_validator==2.1.1
fastapi==0.111.0
fastapi-cli==0.0.3
fastavro==1.9.4
ffmpy==0.3.2
filelock==3.14.0
fonttools==4.51.0
fsspec==2024.3.1
google==3.0.0
google-ai-generativelanguage==0.6.2
google-api-core==2.19.0
google-api-python-client==2.129.0
google-auth==2.29.0
google-auth-httplib2==0.2.0
google-generativeai==0.5.2
googleapis-common-protos==1.63.0
gradio==4.31.0
gradio_client==0.16.2
grpcio==1.63.0
grpcio-status==1.62.2
h11==0.14.0
httpcore==1.0.5
httplib2==0.22.0
httptools==0.6.1
httpx==0.27.0
httpx-sse==0.4.0
huggingface-hub==0.23.0
idna==3.7
importlib_resources==6.4.0
Jinja2==3.1.4
jmespath==1.0.1
jsonschema==4.22.0
jsonschema-specifications==2023.12.1
kiwisolver==1.4.5
markdown-it-py==3.0.0
MarkupSafe==2.1.5
matplotlib==3.8.4
mdurl==0.1.2
numpy==1.26.4
openai==1.28.1
orjson==3.10.3
packaging==24.0
pandas==2.2.2
pillow==10.3.0
proto-plus==1.23.0
protobuf==4.25.3
pyasn1==0.6.0
pyasn1_modules==0.4.0
pydantic==2.7.1
pydantic_core==2.18.2
pydub==0.25.1
Pygments==2.18.0
PyMuPDF==1.24.3
PyMuPDFb==1.24.3
pyparsing==3.1.2
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
python-multipart==0.0.9
pytz==2024.1
PyYAML==6.0.1
referencing==0.35.1
regex==2024.5.10
requests==2.31.0
rich==13.7.1
rpds-py==0.18.1
rsa==4.9
ruff==0.4.4
s3transfer==0.10.1
semantic-version==2.10.0
shellingham==1.5.4
six==1.16.0
sniffio==1.3.1
soupsieve==2.5
starlette==0.37.2
tiktoken==0.6.0
tokenizers==0.19.1
tomlkit==0.12.0
toolz==0.12.1
tqdm==4.66.4
typer==0.12.3
types-requests==2.31.0.20240406
typing_extensions==4.11.0
tzdata==2024.1
ujson==5.9.0
uritemplate==4.1.1
urllib3==2.2.1
uvicorn==0.29.0
watchfiles==0.21.0
websockets==11.0.3
utils.py
CHANGED
@@ -1,49 +1,45 @@
import fitz
import os
import logging
import random
from models import Paper, PaperProcessor


def extract_text_from_pdf(filename):
    with fitz.open(filename) as pdf_document:
        text = ""
        for page in pdf_document:
            text += page.get_text()
        return text.encode('latin-1', 'replace').decode('latin-1')


def process_paper(pdf_file, paper_dir, prompt_dir, api_keys):
    logging.info(f"Processing file type in process_paper: {type(pdf_file)}")
    logging.debug(f"Starting to process paper: {pdf_file}")
    os.makedirs(paper_dir, exist_ok=True)

    if isinstance(pdf_file, str):
        pdf_path = pdf_file
    elif hasattr(pdf_file, 'name') and hasattr(pdf_file, 'read'):
        pdf_path = os.path.join(paper_dir, pdf_file.name)
        with open(pdf_path, "wb") as f:
            f.write(pdf_file.read())
    else:
        logging.error(
            "Received object is neither a path nor a file-like object.")
        return [], []

    extracted_text = extract_text_from_pdf(pdf_path)
    paper = Paper(pdf_file.name if hasattr(pdf_file, 'name')
                  else os.path.basename(pdf_path), extracted_text)

    models = ['gpt', 'claude', 'gemini', 'commandr']
    selected_models = random.sample(models, 2)

    reviews = []
    for model in selected_models:
        processor = PaperProcessor(prompt_dir, model, **api_keys)
        review_text = processor.process_paper(paper)
        reviews.append(review_text)
    logging.debug(f"Reviews generated: {reviews}")
    return reviews, selected_models
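End to end, utils.process_paper needs only a PDF path (or file-like object), a scratch directory, the prompt directory, and the key dict from app.py; a minimal sketch, assuming the same environment variables are set (paper.pdf is a hypothetical local file):

# Hypothetical end-to-end call; two of the four backends are sampled at random.
import os
from utils import process_paper

api_keys = {
    'openai_api_key': os.environ.get('openai_api_key'),
    'claude_api_key': os.environ.get('anthropic_api_key'),
    'gemini_api_key': os.environ.get('google_api_key'),
    'commandr_api_key': os.environ.get('cohere_api_key'),
}
reviews, selected = process_paper("paper.pdf", "path_to_temp_storage", "iclr2024", api_keys)
print(selected)        # e.g. ['claude', 'gemini']
print(reviews[0][0])   # first section of the first review, e.g. 'Summary: ...'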