Spaces:

Ki-Seki
/

AutoTab

Running

App Files Files Community

Ki-Seki committed on Jul 26, 2024

Commit

55638b2

1 Parent(s): ddab079

init

Browse files

Files changed (5) hide show

LICENSE +21 -0
README.md +4 -2
app.py +58 -0
autotab.py +119 -0
requirements.txt +5 -0

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2024 Ki Seki
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md CHANGED Viewed

@@ -1,5 +1,5 @@
 ---
-title: AutoTab
 emoji: 😻
 colorFrom: gray
 colorTo: yellow
@@ -9,4 +9,6 @@ app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: AutoTabularCompletion
 emoji: 😻
 colorFrom: gray
 colorTo: yellow
 pinned: false
 ---
+Automatically complete missing output values in tabular data based on in-context learning.
+Visit https://github.com/Ki-Seki/autotab for more information.

app.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import gradio as gr
+from autotab import AutoTab
+import json
+def auto_tabulator_completion(
+    in_file,
+    instruction,
+    max_examples,
+    model_name,
+    generation_config,
+    save_every,
+):
+    """Run AutoTab on an uploaded Excel file and return the completed table.
+
+    Args:
+        in_file: Uploaded file from ``gr.File``; its ``.name`` attribute (or
+            the value itself when it has none) is used as the input path.
+        instruction: Instruction text passed to ``AutoTab``.
+        max_examples: Maximum number of in-context examples (coerced to int).
+        model_name: Model identifier passed to ``AutoTab``.
+        generation_config: JSON object text with extra generation parameters.
+        save_every: Save interval passed to ``AutoTab`` (coerced to int).
+
+    Returns:
+        A tuple of the output workbook path and the first 15 rows of the
+        completed data.
+
+    Raises:
+        gr.Error: If no file was uploaded, the generation config is not a
+            JSON object, or no API key is configured.
+    """
+    import os
+
+    if in_file is None:
+        raise gr.Error("Please upload an input Excel file.")
+    try:
+        parsed_config = json.loads(generation_config)
+    except json.JSONDecodeError as err:
+        raise gr.Error(f"Generation config is not valid JSON: {err}") from err
+    if not isinstance(parsed_config, dict):
+        raise gr.Error("Generation config must be a JSON object.")
+    # The credential is read from the environment (e.g. a Space secret) so
+    # that no key is ever committed to the repository.
+    api_key = os.environ.get("AUTOTAB_API_KEY")
+    if not api_key:
+        raise gr.Error("AUTOTAB_API_KEY is not set in the environment.")
+    base_url = os.environ.get(
+        "AUTOTAB_BASE_URL", "https://public-beta-api.siliconflow.cn/v1"
+    )
+    output_file_name = "output.xlsx"
+    autotab = AutoTab(
+        in_file_path=getattr(in_file, "name", in_file),
+        instruction=instruction,
+        out_file_path=output_file_name,
+        # Slider values may arrive as floats; these are counts downstream.
+        max_examples=int(max_examples),
+        model_name=model_name,
+        api_key=api_key,
+        base_url=base_url,
+        generation_config=parsed_config,
+        save_every=int(save_every),
+    )
+    autotab.run()
+    return output_file_name, autotab.data[:15]
+# Gradio interface: six inputs matching the parameters of
+# auto_tabulator_completion, in order; two outputs matching its return tuple.
+inputs = [
+    gr.File(label="Input Excel File"),
+    gr.Textbox(
+        value="You are a helpful assistant. Help me finish the task.",
+        label="Instruction",
+    ),
+    # step=1 keeps this count integral.
+    # NOTE(review): without an explicit step Gradio appears to derive one from
+    # the range, which can be fractional - confirm against the Slider docs.
+    gr.Slider(value=5, minimum=1, maximum=100, step=1, label="Max Examples"),
+    gr.Textbox(value="Qwen/Qwen2-7B-Instruct", label="Model Name"),
+    gr.Textbox(
+        value='{"temperature": 0, "max_tokens": 128}',
+        label="Generation Config in Dict",
+    ),
+    gr.Slider(
+        value=10, minimum=1, maximum=1000, step=1, label="Save Every N Steps"
+    ),
+]
+outputs = [
+    gr.File(label="Output Excel File"),
+    gr.Dataframe(label="First 15 rows."),
+]
+demo = gr.Interface(
+    fn=auto_tabulator_completion,
+    inputs=inputs,
+    outputs=outputs,
+    title="Auto Tabulator Completion",
+    description="Automatically complete missing output values in tabular data based on in-context learning.",
+)
+demo.launch()

autotab.py ADDED Viewed

	@@ -0,0 +1,119 @@

+import re
+import openai
+import pandas as pd
+from tqdm import tqdm
+class AutoTab:
+    """Complete missing output cells of an Excel table via in-context learning.
+
+    Columns whose header starts with ``"[Input] "`` are inputs and columns
+    whose header starts with ``"[Output] "`` are outputs. Rows with every
+    output present serve as examples; every other row is sent to the model
+    and its output columns are filled from the tagged response.
+    """
+
+    INPUT_PREFIX = "[Input] "
+    OUTPUT_PREFIX = "[Output] "
+
+    def __init__(
+        self,
+        in_file_path: str,
+        out_file_path: str,
+        max_examples: int,
+        model_name: str,
+        api_key: str,
+        base_url: str,
+        generation_config: dict,
+        save_every: int,
+        instruction: str,
+    ):
+        """Store the configuration; no file or network access happens here."""
+        self.in_file_path = in_file_path
+        self.out_file_path = out_file_path
+        self.max_examples = max_examples
+        self.model_name = model_name
+        self.api_key = api_key
+        self.base_url = base_url
+        self.generation_config = generation_config
+        self.save_every = save_every
+        self.instruction = instruction
+        # Filled in later by derive_incontext / predict_output / run.
+        self.in_context = ""
+        self.query_example = ""
+        self.data = None
+        self._client = None
+    @staticmethod
+    def _tagged(col: str, prefix: str, value) -> str:
+        """Render one ``<field>value</field>`` line, dropping the leading prefix."""
+        name = col.removeprefix(prefix)
+        return f"<{name}>{value}</{name}>\n"
+    # ─── IO ───────────────────────────────────────────────────────────────
+    def load_excel(self) -> tuple[pd.DataFrame, list, list]:
+        """Load the Excel file and identify input and output fields.
+
+        Returns:
+            The data frame, the input column names and the output column names.
+        """
+        df = pd.read_excel(self.in_file_path)
+        # Headers are not guaranteed to be strings (e.g. numeric headers).
+        headers = [col for col in df.columns if isinstance(col, str)]
+        input_fields = [c for c in headers if c.startswith(self.INPUT_PREFIX)]
+        output_fields = [c for c in headers if c.startswith(self.OUTPUT_PREFIX)]
+        return df, input_fields, output_fields
+    # ─── LLM ──────────────────────────────────────────────────────────────
+    def openai_request(self, query: str) -> str:
+        """Send ``query`` as a single user message to an OpenAI-format API.
+
+        Returns:
+            The stripped text of the first choice, or ``""`` when the response
+            carries no content.
+        """
+        # Reuse one client for all rows instead of rebuilding it per request.
+        client = getattr(self, "_client", None)
+        if client is None:
+            client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url)
+            self._client = client
+        response = client.chat.completions.create(
+            model=self.model_name,
+            messages=[{"role": "user", "content": query}],
+            **self.generation_config,
+        )
+        content = response.choices[0].message.content
+        return (content or "").strip()
+    # ─── In-Context Learning ──────────────────────────────────────────────
+    def derive_incontext(
+        self, data: pd.DataFrame, input_columns: list[str], output_columns: list[str]
+    ) -> str:
+        """Build the in-context prompt from rows whose outputs are all present.
+
+        At most ``max_examples`` rows are used, in table order. Each example is
+        its tagged input lines, then its tagged output lines, then a blank line.
+        The result is also stored on ``self.in_context``.
+        """
+        limit = max(0, int(self.max_examples))
+        # Select the example rows themselves rather than the first rows by
+        # position, so examples need not sit at the top of the sheet.
+        examples = data.dropna(subset=output_columns).head(limit)
+        parts = []
+        for pos in range(len(examples)):
+            # Column-wise access keeps each column's own dtype when formatting.
+            parts.extend(
+                self._tagged(col, self.INPUT_PREFIX, examples[col].iloc[pos])
+                for col in input_columns
+            )
+            parts.extend(
+                self._tagged(col, self.OUTPUT_PREFIX, examples[col].iloc[pos])
+                for col in output_columns
+            )
+            parts.append("\n")
+        in_context = "".join(parts)
+        self.in_context = in_context
+        return in_context
+    def predict_output(
+        self, in_context: str, input_data: pd.Series, input_fields: list[str]
+    ) -> str:
+        """Query the model for one row and return its raw text response.
+
+        The query is the instruction, a blank line, the in-context examples and
+        the row's tagged input lines. It is also stored on ``self.query_example``.
+        """
+        query = (
+            self.instruction
+            + "\n\n"
+            + in_context
+            + "".join(
+                self._tagged(col, self.INPUT_PREFIX, input_data[col])
+                for col in input_fields
+            )
+        )
+        self.query_example = query
+        return self.openai_request(query)
+    def extract_fields(
+        self, response: str, output_columns: list[str]
+    ) -> dict[str, str]:
+        """Extract each output column's value from the tagged response text.
+
+        Returns:
+            A mapping from column name to the text between its tags, or ``""``
+            when the tags are not found.
+        """
+        extracted = {}
+        for col in output_columns:
+            # Escape the name so characters such as "(" or "+" match literally;
+            # DOTALL lets a value span several lines.
+            tag = re.escape(col.removeprefix(self.OUTPUT_PREFIX))
+            match = re.search(rf"<{tag}>(.*?)</{tag}>", response, re.DOTALL)
+            extracted[col] = match.group(1) if match else ""
+        return extracted
+    # ─── Engine ───────────────────────────────────────────────────────────
+    def run(self):
+        """Fill every row that lacks an output value and save the workbook.
+
+        The table is written to ``out_file_path`` after every ``save_every``
+        completed rows and once more at the end; the final frame is kept on
+        ``self.data``.
+        """
+        data, input_fields, output_fields = self.load_excel()
+        in_context = self.derive_incontext(data, input_fields, output_fields)
+        # Predictions are strings; object dtype avoids writing text into a
+        # numeric column.
+        for col in output_fields:
+            data[col] = data[col].astype(object)
+        # Row labels with at least one missing output, wherever they are.
+        pending = data.index[data[output_fields].isna().any(axis=1)]
+        save_every = max(1, int(self.save_every))
+        for done, idx in enumerate(tqdm(pending), start=1):
+            prediction = self.predict_output(in_context, data.loc[idx], input_fields)
+            extracted_fields = self.extract_fields(prediction, output_fields)
+            for field_name in output_fields:
+                data.at[idx, field_name] = extracted_fields.get(field_name, "")
+            if done % save_every == 0:
+                data.to_excel(self.out_file_path, index=False)
+        self.data = data
+        data.to_excel(self.out_file_path, index=False)
+        print(f"Results saved to {self.out_file_path}")

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+pandas
+openai
+argparse
+openpyxl
+gradio