init
Browse files- LICENSE +21 -0
- README.md +4 -2
- app.py +58 -0
- autotab.py +119 -0
- requirements.txt +5 -0
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2024 Ki Seki
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
emoji: 💻
|
4 |
colorFrom: gray
|
5 |
colorTo: yellow
|
@@ -9,4 +9,6 @@ app_file: app.py
|
|
9 |
pinned: false
|
10 |
---
|
11 |
|
12 |
-
|
|
|
|
|
|
1 |
---
|
2 |
+
title: AutoTabularCompletion
|
3 |
emoji: 💻
|
4 |
colorFrom: gray
|
5 |
colorTo: yellow
|
|
|
9 |
pinned: false
|
10 |
---
|
11 |
|
12 |
+
Automatically complete missing output values in tabular data based on in-context learning.
|
13 |
+
|
14 |
+
Visit https://github.com/Ki-Seki/autotab for more information.
|
app.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
|
3 |
+
from autotab import AutoTab
|
4 |
+
import json
|
5 |
+
|
6 |
+
|
7 |
+
def auto_tabulator_completion(
    in_file,
    instruction,
    max_examples,
    model_name,
    generation_config,
    save_every,
):
    """Run AutoTab on an uploaded Excel file and return the completed table.

    Parameters
    ----------
    in_file : gradio file wrapper; ``in_file.name`` is the path on disk.
    instruction : task instruction prepended to every model query.
    max_examples : maximum number of complete rows used as in-context examples.
    model_name : chat model identifier for the OpenAI-format API.
    generation_config : JSON string of generation kwargs (e.g. temperature).
    save_every : checkpoint interval, in rows, for writing the output file.

    Returns
    -------
    tuple of (output file path, first 15 rows of the completed DataFrame).
    """
    import os

    # Fixed typo: the output file was previously named "ouput.xlsx".
    output_file_name = "output.xlsx"
    autotab = AutoTab(
        in_file_path=in_file.name,
        instruction=instruction,
        out_file_path=output_file_name,
        max_examples=max_examples,
        model_name=model_name,
        # SECURITY: this API key was hard-coded and committed to the repo; it
        # should be rotated. Prefer the environment variable, falling back to
        # the original value so existing deployments keep working.
        api_key=os.environ.get(
            "SILICONFLOW_API_KEY",
            "sk-exhahhjfqyanmwewndukcqtrpegfdbwszkjucvcpajdufiah",
        ),
        base_url="https://public-beta-api.siliconflow.cn/v1",
        generation_config=json.loads(generation_config),
        save_every=save_every,
    )
    autotab.run()
    return output_file_name, autotab.data[:15]
|
29 |
+
|
30 |
+
|
31 |
+
# Gradio interface wiring.
# Widget order must match the parameter order of auto_tabulator_completion:
# (in_file, instruction, max_examples, model_name, generation_config, save_every).
instruction_box = gr.Textbox(
    value="You are a helpful assistant. Help me finish the task.",
    label="Instruction",
)
generation_config_box = gr.Textbox(
    value='{"temperature": 0, "max_tokens": 128}',
    label="Generation Config in Dict",
)

inputs = [
    gr.File(label="Input Excel File"),
    instruction_box,
    gr.Slider(value=5, minimum=1, maximum=100, label="Max Examples"),
    gr.Textbox(value="Qwen/Qwen2-7B-Instruct", label="Model Name"),
    generation_config_box,
    gr.Slider(value=10, minimum=1, maximum=1000, label="Save Every N Steps"),
]

outputs = [
    gr.File(label="Output Excel File"),
    gr.Dataframe(label="First 15 rows."),
]

demo = gr.Interface(
    fn=auto_tabulator_completion,
    inputs=inputs,
    outputs=outputs,
    title="Auto Tabulator Completion",
    description="Automatically complete missing output values in tabular data based on in-context learning.",
)
demo.launch()
|
autotab.py
ADDED
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
|
3 |
+
import openai
|
4 |
+
import pandas as pd
|
5 |
+
from tqdm import tqdm
|
6 |
+
|
7 |
+
|
8 |
+
class AutoTab:
    """Complete missing "[Output] ..." columns of an Excel sheet via in-context learning.

    Rows that already contain all output values serve as few-shot examples;
    every remaining row is filled by querying an OpenAI-format chat API and
    parsing `<field>...</field>` tags out of the response.
    """

    def __init__(
        self,
        in_file_path: str,
        out_file_path: str,
        max_examples: int,
        model_name: str,
        api_key: str,
        base_url: str,
        generation_config: dict,
        save_every: int,
        instruction: str,
    ):
        self.in_file_path = in_file_path      # source .xlsx with [Input]/[Output] columns
        self.out_file_path = out_file_path    # destination .xlsx (also used for checkpoints)
        self.max_examples = max_examples      # cap on few-shot examples in the prompt
        self.model_name = model_name
        self.api_key = api_key
        self.base_url = base_url
        self.generation_config = generation_config  # extra kwargs for chat.completions.create
        self.save_every = save_every          # checkpoint interval, in rows
        self.instruction = instruction        # task instruction prepended to every query

    # ─── IO ───────────────────────────────────────────────────────────────

    def load_excel(self) -> tuple[pd.DataFrame, list, list]:
        """Load the Excel file and identify input and output fields.

        Columns are classified purely by their "[Input] " / "[Output] "
        name prefixes; all other columns are ignored.
        """
        df = pd.read_excel(self.in_file_path)
        input_fields = [col for col in df.columns if col.startswith("[Input] ")]
        output_fields = [col for col in df.columns if col.startswith("[Output] ")]
        return df, input_fields, output_fields

    # ─── LLM ──────────────────────────────────────────────────────────────

    def openai_request(self, query: str) -> str:
        """Make a request to an OpenAI-format API and return the stripped text.

        The client is created lazily and cached on the instance so repeated
        per-row calls don't rebuild it every time.
        """
        client = getattr(self, "_client", None)
        if client is None:
            client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url)
            self._client = client
        response = client.chat.completions.create(
            model=self.model_name,
            messages=[{"role": "user", "content": query}],
            **self.generation_config,
        )
        return response.choices[0].message.content.strip()

    # ─── In-Context Learning ──────────────────────────────────────────────

    def derive_incontext(
        self, data: pd.DataFrame, input_columns: list[str], output_columns: list[str]
    ) -> str:
        """Build the few-shot prompt section with angle-bracket tags.

        Uses the first ``n`` rows positionally, where ``n`` is capped by
        ``max_examples`` and by the number of rows with no missing outputs.
        NOTE(review): assumes the complete rows come first in the sheet —
        confirm against the expected input format.
        """
        n = min(self.max_examples, len(data.dropna(subset=output_columns)))
        in_context = ""
        for i in range(n):
            in_context += "".join(
                f"<{col.replace('[Input] ', '')}>{data[col].iloc[i]}</{col.replace('[Input] ', '')}>\n"
                for col in input_columns
            )
            in_context += "".join(
                f"<{col.replace('[Output] ', '')}>{data[col].iloc[i]}</{col.replace('[Output] ', '')}>\n"
                for col in output_columns
            )
            in_context += "\n"
        self.in_context = in_context  # kept for debugging/inspection
        return in_context

    def predict_output(
        self, in_context: str, input_data: pd.Series, input_fields: list[str]
    ):
        """Predict the output values for one row using the API.

        ``input_fields`` was previously mis-annotated as ``str``; it is the
        list of "[Input] ..." column names.
        """
        query = (
            self.instruction
            + "\n\n"
            + in_context
            + "".join(
                f"<{col.replace('[Input] ', '')}>{input_data[col]}</{col.replace('[Input] ', '')}>\n"
                for col in input_fields
            )
        )
        self.query_example = query  # kept for debugging/inspection
        return self.openai_request(query)

    def extract_fields(
        self, response: str, output_columns: list[str]
    ) -> dict[str, str]:
        """Extract `<field>...</field>` values from the response text.

        Missing fields map to "". ``re.escape`` guards field names that
        contain regex metacharacters, and DOTALL lets values span newlines.
        """
        extracted = {}
        for col in output_columns:
            field = col.replace("[Output] ", "")
            tag = re.escape(field)
            match = re.search(f"<{tag}>(.*?)</{tag}>", response, re.DOTALL)
            extracted[col] = match.group(1) if match else ""
        return extracted

    # ─── Engine ───────────────────────────────────────────────────────────

    def run(self):
        """Fill every incomplete row, checkpointing every ``save_every`` rows."""
        data, input_fields, output_fields = self.load_excel()
        in_context = self.derive_incontext(data, input_fields, output_fields)

        # NOTE(review): rows with complete outputs are assumed to be the
        # first rows; only the remainder is predicted.
        num_existed_examples = len(data.dropna(subset=output_fields))

        for i in tqdm(range(num_existed_examples, len(data))):
            prediction = self.predict_output(in_context, data.iloc[i], input_fields)
            extracted_fields = self.extract_fields(prediction, output_fields)
            for field_name in output_fields:
                # .at uses label indexing — assumes a default RangeIndex so
                # position i is also the row label; TODO confirm.
                data.at[i, field_name] = extracted_fields.get(field_name, "")
            if i % self.save_every == 0:
                data.to_excel(self.out_file_path, index=False)  # checkpoint
        self.data = data
        data.to_excel(self.out_file_path, index=False)
        print(f"Results saved to {self.out_file_path}")
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pandas
|
2 |
+
openai
|
3 |
+
argparse
|
4 |
+
openpyxl
|
5 |
+
gradio
|