Ki-Seki committed on
Commit
55638b2
•
1 Parent(s): ddab079
Files changed (5) hide show
  1. LICENSE +21 -0
  2. README.md +4 -2
  3. app.py +58 -0
  4. autotab.py +119 -0
  5. requirements.txt +5 -0
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Ki Seki
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: AutoTab
3
  emoji: 😻
4
  colorFrom: gray
5
  colorTo: yellow
@@ -9,4 +9,6 @@ app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
1
  ---
2
+ title: AutoTabularCompletion
3
  emoji: 😻
4
  colorFrom: gray
5
  colorTo: yellow
 
9
  pinned: false
10
  ---
11
 
12
+ Automatically complete missing output values in tabular data based on in-context learning.
13
+
14
+ Visit https://github.com/Ki-Seki/autotab for more information.
app.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ from autotab import AutoTab
4
+ import json
5
+
6
+
7
def auto_tabulator_completion(
    in_file,
    instruction,
    max_examples,
    model_name,
    generation_config,
    save_every,
):
    """Run AutoTab on an uploaded Excel file and return the completed sheet.

    Args:
        in_file: The uploaded file as handed over by the ``gr.File`` input.
            Either an object exposing the path as ``.name`` or a plain path
            string is accepted.
        instruction: Instruction text placed at the top of every prompt.
        max_examples: Upper bound on the number of in-context examples.
        model_name: Model identifier forwarded to the chat-completions API.
        generation_config: JSON object (as text) with extra keyword arguments
            for the completion request, e.g. ``{"temperature": 0}``.
        save_every: Write an intermediate result every this many rows.

    Returns:
        A ``(output_path, preview)`` tuple: the path of the written Excel file
        and the first 15 rows of the completed table (``None`` if the AutoTab
        instance exposes no ``data`` attribute after the run).

    Raises:
        gr.Error: If no file was uploaded or ``generation_config`` is not a
            JSON object.
    """
    # Function-scope imports keep this block self-contained.
    import os
    import tempfile

    if in_file is None:
        raise gr.Error("Please upload an input Excel file.")
    # Accept both a tempfile-like wrapper (``.name``) and a bare path string.
    in_file_path = getattr(in_file, "name", in_file)

    try:
        parsed_config = json.loads(generation_config)
    except (TypeError, ValueError) as err:
        raise gr.Error(f"Generation config is not valid JSON: {err}") from err
    if not isinstance(parsed_config, dict):
        raise gr.Error("Generation config must be a JSON object.")

    # One private directory per call, so concurrent requests never overwrite
    # each other's result file.
    output_file_name = os.path.join(
        tempfile.mkdtemp(prefix="autotab_"), "output.xlsx"
    )

    # NOTE(review): a credential is committed in source. Prefer setting
    # AUTOTAB_API_KEY in the environment; the literal fallback is kept only
    # for backward compatibility and should be rotated and removed.
    api_key = os.environ.get(
        "AUTOTAB_API_KEY",
        "sk-exhahhjfqyanmwewndukcqtrpegfdbwszkjucvcpajdufiah",
    )

    autotab = AutoTab(
        in_file_path=in_file_path,
        instruction=instruction,
        out_file_path=output_file_name,
        # Slider values may arrive as floats; AutoTab uses them as counts.
        max_examples=int(max_examples),
        model_name=model_name,
        api_key=api_key,
        base_url="https://public-beta-api.siliconflow.cn/v1",
        generation_config=parsed_config,
        save_every=max(1, int(save_every)),
    )
    autotab.run()

    # ``data`` may be missing if the run had nothing to predict.
    table = getattr(autotab, "data", None)
    preview = table[:15] if table is not None else None
    return output_file_name, preview
29
+
30
+
31
# Gradio interface
#
# The order of ``inputs`` must match the positional parameters of
# ``auto_tabulator_completion``; ``outputs`` matches its return tuple.
inputs = [
    gr.File(label="Input Excel File"),
    gr.Textbox(
        value="You are a helpful assistant. Help me finish the task.",
        label="Instruction",
    ),
    # step=1: both sliders feed integer counts, so only whole values are offered.
    gr.Slider(value=5, minimum=1, maximum=100, step=1, label="Max Examples"),
    gr.Textbox(value="Qwen/Qwen2-7B-Instruct", label="Model Name"),
    gr.Textbox(
        value='{"temperature": 0, "max_tokens": 128}',
        label="Generation Config in Dict",
    ),
    gr.Slider(
        value=10, minimum=1, maximum=1000, step=1, label="Save Every N Steps"
    ),
]

outputs = [
    gr.File(label="Output Excel File"),
    gr.Dataframe(label="First 15 rows."),
]

demo = gr.Interface(
    fn=auto_tabulator_completion,
    inputs=inputs,
    outputs=outputs,
    title="Auto Tabulator Completion",
    description="Automatically complete missing output values in tabular data based on in-context learning.",
)

# Launch only when executed as a script so importing this module (e.g. from
# tests or tooling) does not start a web server as a side effect.
if __name__ == "__main__":
    demo.launch()
autotab.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+ import openai
4
+ import pandas as pd
5
+ from tqdm import tqdm
6
+
7
+
8
class AutoTab:
    """Fill in missing output cells of an Excel sheet via in-context learning.

    Columns whose header starts with ``"[Input] "`` are treated as inputs and
    columns whose header starts with ``"[Output] "`` as outputs. Rows that
    already have every output value serve as few-shot examples; rows with a
    missing output value are sent to an OpenAI-format chat API and the answer
    is parsed back into the output columns.

    Attributes set while running:
        in_context: The few-shot example text built by ``derive_incontext``.
        query_example: The most recent full prompt built by ``predict_output``.
        data: The working DataFrame, available once ``run`` has loaded it.
    """

    INPUT_PREFIX = "[Input] "
    OUTPUT_PREFIX = "[Output] "

    def __init__(
        self,
        in_file_path: str,
        out_file_path: str,
        max_examples: int,
        model_name: str,
        api_key: str,
        base_url: str,
        generation_config: dict,
        save_every: int,
        instruction: str,
    ):
        """Store the configuration; no I/O happens here.

        Args:
            in_file_path: Path of the Excel file to read.
            out_file_path: Path the completed Excel file is written to.
            max_examples: Maximum number of in-context examples per prompt.
            model_name: Model identifier passed to the API.
            api_key: API key for the OpenAI-format endpoint.
            base_url: Base URL of the OpenAI-format endpoint.
            generation_config: Extra keyword arguments for the completion call.
            save_every: Save intermediate results every this many predictions.
            instruction: Text placed at the start of every prompt.
        """
        self.in_file_path = in_file_path
        self.out_file_path = out_file_path
        self.max_examples = max_examples
        self.model_name = model_name
        self.api_key = api_key
        self.base_url = base_url
        self.generation_config = generation_config
        self.save_every = save_every
        self.instruction = instruction
        # Created lazily on the first request and then reused.
        self._client = None

    # ─── IO ───────────────────────────────────────────────────────────────

    def load_excel(self) -> tuple[pd.DataFrame, list, list]:
        """Load the Excel file and identify input and output fields.

        Returns:
            ``(df, input_fields, output_fields)`` where the two lists hold the
            column names carrying the input / output prefix.
        """
        df = pd.read_excel(self.in_file_path)
        # Headers are not guaranteed to be strings (e.g. numeric headers), so
        # guard before calling ``startswith``.
        input_fields = [
            col
            for col in df.columns
            if isinstance(col, str) and col.startswith(self.INPUT_PREFIX)
        ]
        output_fields = [
            col
            for col in df.columns
            if isinstance(col, str) and col.startswith(self.OUTPUT_PREFIX)
        ]
        return df, input_fields, output_fields

    # ─── LLM ──────────────────────────────────────────────────────────────

    def openai_request(self, query: str) -> str:
        """Make a request to an OpenAI-format API.

        Args:
            query: The full prompt, sent as a single user message.

        Returns:
            The stripped text of the first choice, or ``""`` when the API
            returns no content.
        """
        client = getattr(self, "_client", None)
        if client is None:
            client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url)
            self._client = client
        response = client.chat.completions.create(
            model=self.model_name,
            messages=[{"role": "user", "content": query}],
            **(self.generation_config or {}),
        )
        # ``content`` can be None; treat that as an empty answer.
        content = response.choices[0].message.content
        return (content or "").strip()

    # ─── In-Context Learning ──────────────────────────────────────────────

    @staticmethod
    def _render_fields(values, columns, prefix: str) -> str:
        """Render ``<name>value</name>`` lines, one per column.

        ``name`` is the column header with the leading ``prefix`` removed;
        ``values`` is anything indexable by column name.
        """
        return "".join(
            f"<{col.removeprefix(prefix)}>{values[col]}</{col.removeprefix(prefix)}>\n"
            for col in columns
        )

    def derive_incontext(
        self, data: pd.DataFrame, input_columns: list[str], output_columns: list[str]
    ) -> str:
        """Derive the in-context prompt with angle brackets.

        Only rows that have a value in every output column are used as
        examples, at most ``max_examples`` of them, in sheet order. The result
        is also stored on ``self.in_context``.

        Returns:
            The example text; each example is followed by a blank line.
        """
        labelled = data.dropna(subset=output_columns)
        n = max(0, min(int(self.max_examples), len(labelled)))
        examples = labelled.head(n)
        blocks = []
        for i in range(len(examples)):
            # Read cell by cell (not row-wise) so each value keeps its own
            # column dtype when formatted.
            cells = {
                col: examples[col].iloc[i]
                for col in (*input_columns, *output_columns)
            }
            blocks.append(
                self._render_fields(cells, input_columns, self.INPUT_PREFIX)
                + self._render_fields(cells, output_columns, self.OUTPUT_PREFIX)
                + "\n"
            )
        in_context = "".join(blocks)
        self.in_context = in_context
        return in_context

    def predict_output(
        self, in_context: str, input_data: pd.Series, input_fields: list[str]
    ):
        """Predict the output values for the given input data using the API.

        Args:
            in_context: Example text from ``derive_incontext``.
            input_data: One row; anything indexable by column name works.
            input_fields: Names of the input columns to include in the prompt.

        Returns:
            The raw text answer from ``openai_request``. The prompt is kept on
            ``self.query_example``.
        """
        query = (
            self.instruction
            + "\n\n"
            + in_context
            + self._render_fields(input_data, input_fields, self.INPUT_PREFIX)
        )
        self.query_example = query
        return self.openai_request(query)

    def extract_fields(
        self, response: str, output_columns: list[str]
    ) -> dict[str, str]:
        """Extract fields from the response text based on output columns.

        For every output column the first ``<name>...</name>`` span is taken,
        where ``name`` is the column header without its prefix. Columns with
        no matching span map to ``""``.
        """
        text = response or ""
        extracted = {}
        for col in output_columns:
            # Escape the name so characters like "(" or "+" are literal, and
            # let "." span newlines so multi-line values are captured.
            tag = re.escape(col.removeprefix(self.OUTPUT_PREFIX))
            match = re.search(rf"<{tag}>(.*?)</{tag}>", text, flags=re.DOTALL)
            extracted[col] = match.group(1) if match else ""
        return extracted

    # ─── Engine ───────────────────────────────────────────────────────────

    def run(self):
        """Complete every row with a missing output value and save the sheet.

        Intermediate results are written every ``save_every`` predictions and
        the final table is always written to ``out_file_path``.
        """
        data, input_fields, output_fields = self.load_excel()
        in_context = self.derive_incontext(data, input_fields, output_fields)

        # Predictions are strings; an all-NaN output column is loaded as
        # float64, so widen to object before writing into it.
        for col in output_fields:
            data[col] = data[col].astype(object)
        # Expose the table even when there is nothing to predict.
        self.data = data

        if output_fields:
            # Rows with any missing output, wherever they sit in the sheet.
            pending = data.index[data[output_fields].isna().any(axis=1)]
        else:
            pending = data.index[:0]

        save_every = max(1, int(self.save_every))
        for step, idx in enumerate(tqdm(pending), start=1):
            # Built cell by cell so values are formatted exactly as in the
            # in-context examples.
            row = pd.Series(
                {col: data.at[idx, col] for col in input_fields}, dtype=object
            )
            prediction = self.predict_output(in_context, row, input_fields)
            extracted_fields = self.extract_fields(prediction, output_fields)
            for field_name in output_fields:
                data.at[idx, field_name] = extracted_fields.get(field_name, "")
            if step % save_every == 0:
                data.to_excel(self.out_file_path, index=False)
        data.to_excel(self.out_file_path, index=False)
        print(f"Results saved to {self.out_file_path}")
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ pandas
2
+ openai
3
+ argparse
4
+ openpyxl
5
+ gradio