Commit 4bc02fa
Ki-Seki committed
2 parents: bc989cb c2c0cff

Merge branch 'master'

app.py CHANGED
@@ -15,10 +15,10 @@ def auto_tabulator_completion(
     generation_config: dict,
     request_interval: float,
     save_every: int,
-    api_key: str,
+    str_api_keys: str,
     base_url: str,
 ) -> tuple[str, str, str, pd.DataFrame]:
-    output_file_name = "ouput.xlsx"
+    output_file_name = f"output_{time.strftime('%Y%m%d%H%M%S')}.xlsx"
     autotab = AutoTab(
         in_file_path=in_file_path,
         out_file_path=output_file_name,
@@ -28,14 +28,23 @@ def auto_tabulator_completion(
         generation_config=json.loads(generation_config),
         request_interval=request_interval,
         save_every=save_every,
-        api_key=api_key,
+        api_keys=str_api_keys.split(),
         base_url=base_url,
     )
     start = time.time()
     autotab.run()
-    time_taken = time.strftime("%H:%M:%S", time.gmtime(time.time() - start))
+    time_taken = time.time() - start
 
-    return time_taken, output_file_name, autotab.query_example, autotab.data[:15]
+    report = f"Total data points: {autotab.num_data}\n" + \
+        f"Total missing (before): {autotab.num_missing}\n" + \
+        f"Total missing (after): {autotab.failed_count}\n" + \
+        f"Total queries made: {autotab.request_count}\n" + \
+        f"Time taken: {time.strftime('%H:%M:%S', time.gmtime(time.time() - start))}\n" + \
+        f"Prediction per second: {autotab.num_missing / time_taken:.2f}\n" + \
+        f"Query per second: {autotab.request_count / time_taken:.2f}"
+
+    query_example = autotab.query_example if autotab.request_count > 0 else "No queries made."
+    return report, output_file_name, query_example, autotab.data[:15]
 
 
 # Gradio interface
@@ -45,7 +54,7 @@ inputs = [
         value="You are a helpful assistant. Help me finish the task.",
         label="Instruction",
     ),
-    gr.Slider(value=5, minimum=1, maximum=50, step=1, label="Max Examples"),
+    gr.Slider(value=4, minimum=1, maximum=50, step=1, label="Max Examples"),
     gr.Textbox(value="Qwen/Qwen2-7B-Instruct", label="Model Name"),
     gr.Textbox(
         value='{"temperature": 0, "max_tokens": 128}',
@@ -54,13 +63,14 @@ inputs = [
     gr.Slider(value=0.1, minimum=0, maximum=10, label="Request Interval in Seconds"),
     gr.Slider(value=100, minimum=1, maximum=1000, step=1, label="Save Every N Steps"),
     gr.Textbox(
-        value="sk-exhahhjfqyanmwewndukcqtrpegfdbwszkjucvcpajdufiah", label="API Key"
+        value="sk-exhahhjfqyanmwewndukcqtrpegfdbwszkjucvcpajdufiah",
+        label="API Key(s). One per line.",
     ),
     gr.Textbox(value="https://public-beta-api.siliconflow.cn/v1", label="Base URL"),
 ]
 
 outputs = [
-    gr.Textbox(label="Time Taken"),
+    gr.Textbox(label="Report"),
     gr.File(label="Output Excel File"),
     gr.Textbox(label="Query Example"),
     gr.Dataframe(label="First 15 rows."),
@@ -71,5 +81,5 @@ gr.Interface(
     inputs=inputs,
     outputs=outputs,
     title="Auto Tabulator Completion",
-    description="Automatically complete missing output values in tabular data based on in-context learning.",
+    description="Automatically complete missing output values in tabular data based on in-context learning. Check https://github.com/Ki-Seki/autotab.",
 ).launch()
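
A note on the new multi-key input: the textbox label says "One per line," and the handler calls str_api_keys.split(), which with no separator argument splits on any run of whitespace, so newline-separated keys (and incidentally space-separated ones) both parse. A minimal sketch of that parsing, with hypothetical keys:

    # str.split() with no argument splits on any whitespace run,
    # so keys pasted one per line come back as a clean list.
    str_api_keys = "sk-key-one\nsk-key-two"        # value pasted into the textbox
    api_keys = str_api_keys.split()
    assert api_keys == ["sk-key-one", "sk-key-two"]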
assets/demo.png ADDED
autotab.py CHANGED
@@ -19,7 +19,7 @@ class AutoTab:
         generation_config: dict,
         request_interval: float,
         save_every: int,
-        api_key: str,
+        api_keys: list[str],
         base_url: str,
     ):
         self.in_file_path = in_file_path
@@ -30,9 +30,17 @@ class AutoTab:
         self.generation_config = generation_config
         self.request_interval = request_interval
         self.save_every = save_every
-        self.api_key = api_key
+        self.api_keys = api_keys
         self.base_url = base_url
 
+        self.request_count = 0
+        self.failed_count = 0
+        self.data, self.input_fields, self.output_fields = self.load_excel()
+        self.in_context = self.derive_incontext()
+        self.num_data = len(self.data)
+        self.num_example = len(self.data.dropna(subset=self.output_fields))
+        self.num_missing = self.num_data - self.num_example
+
     # ─── IO ───────────────────────────────────────────────────────────────
 
     def load_excel(self) -> tuple[pd.DataFrame, list, list]:
@@ -47,8 +55,15 @@ class AutoTab:
     @retry(wait=wait_random_exponential(min=20, max=60), stop=stop_after_attempt(6))
     def openai_request(self, query: str) -> str:
         """Make a request to an OpenAI-format API."""
+
+        # Wait for the request interval
         time.sleep(self.request_interval)
-        client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url)
+
+        # Increment the request count
+        api_key = self.api_keys[self.request_count % len(self.api_keys)]
+        self.request_count += 1
+
+        client = openai.OpenAI(api_key=api_key, base_url=self.base_url)
         response = client.chat.completions.create(
             model=self.model_name,
             messages=[{"role": "user", "content": query}],
@@ -59,61 +74,60 @@ class AutoTab:
 
     # ─── In-Context Learning ──────────────────────────────────────────────
 
-    def derive_incontext(
-        self, data: pd.DataFrame, input_columns: list[str], output_columns: list[str]
-    ) -> str:
+    def derive_incontext(self) -> str:
         """Derive the in-context prompt with angle brackets."""
-        n = min(self.max_examples, len(data.dropna(subset=output_columns)))
+        examples = self.data.dropna(subset=self.output_fields)[: self.max_examples]
         in_context = ""
-        for i in range(n):
+        for i in range(len(examples)):
             in_context += "".join(
-                f"<{col.replace('[Input] ', '')}>{data[col].iloc[i]}</{col.replace('[Input] ', '')}>\n"
-                for col in input_columns
+                f"<{col.replace('[Input] ', '')}>{self.data[col].iloc[i]}</{col.replace('[Input] ', '')}>\n"
+                for col in self.input_fields
             )
             in_context += "".join(
-                f"<{col.replace('[Output] ', '')}>{data[col].iloc[i]}</{col.replace('[Output] ', '')}>\n"
-                for col in output_columns
+                f"<{col.replace('[Output] ', '')}>{self.data[col].iloc[i]}</{col.replace('[Output] ', '')}>\n"
+                for col in self.output_fields
             )
             in_context += "\n"
         return in_context
 
-    def predict_output(
-        self, in_context: str, input_data: pd.DataFrame, input_fields: str
-    ):
+    def predict_output(self, input_data: pd.DataFrame):
         """Predict the output values for the given input data using the API."""
         query = (
             self.instruction
             + "\n\n"
-            + in_context
+            + self.in_context
             + "".join(
                 f"<{col.replace('[Input] ', '')}>{input_data[col]}</{col.replace('[Input] ', '')}>\n"
-                for col in input_fields
+                for col in self.input_fields
             )
         )
         self.query_example = query
         output = self.openai_request(query)
         return output
 
-    def extract_fields(
-        self, response: str, output_columns: list[str]
-    ) -> dict[str, str]:
+    def extract_fields(self, response: str) -> dict[str, str]:
         """Extract fields from the response text based on output columns."""
         extracted = {}
-        for col in output_columns:
+        for col in self.output_fields:
             field = col.replace("[Output] ", "")
             match = re.search(f"<{field}>(.*?)</{field}>", response)
             extracted[col] = match.group(1) if match else ""
+        if any(extracted[col] == "" for col in self.output_fields):
+            self.failed_count += 1
         return extracted
 
     # ─── Engine ───────────────────────────────────────────────────────────
 
-    def _predict_and_extract(self, i: int) -> dict[str, str]:
+    def _predict_and_extract(self, row: int) -> dict[str, str]:
         """Helper function to predict and extract fields for a single row."""
-        prediction = self.predict_output(
-            self.in_context, self.data.iloc[i], self.input_fields
-        )
-        extracted_fields = self.extract_fields(prediction, self.output_fields)
-        return extracted_fields
+
+        # If any output field is empty, predict the output
+        if any(pd.isnull(self.data.at[row, col]) for col in self.output_fields):
+            prediction = self.predict_output(self.data.iloc[row])
+            extracted_fields = self.extract_fields(prediction)
+            return extracted_fields
+        else:
+            return {col: self.data.at[row, col] for col in self.output_fields}
 
     def batch_prediction(self, start_index: int, end_index: int):
         """Process a batch of predictions asynchronously."""
@@ -126,16 +140,8 @@ class AutoTab:
             self.data.at[i, field_name] = extracted_fields.get(field_name, "")
 
     def run(self):
-        self.data, self.input_fields, self.output_fields = self.load_excel()
-        self.in_context = self.derive_incontext(
-            self.data, self.input_fields, self.output_fields
-        )
-
-        self.num_data = len(self.data)
-        self.num_examples = len(self.data.dropna(subset=self.output_fields))
-
-        tqdm_bar = tqdm(total=self.num_data - self.num_examples, leave=False)
-        for start in range(self.num_examples, self.num_data, self.save_every):
+        tqdm_bar = tqdm(total=self.num_data, leave=False)
+        for start in range(0, self.num_data, self.save_every):
             tqdm_bar.update(min(self.save_every, self.num_data - start))
             end = min(start + self.save_every, self.num_data)
             try:
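
The heart of the multi-key change is the round-robin selection in openai_request: indexing the key list by the request counter modulo its length cycles through the keys evenly. A self-contained sketch of the same technique, with hypothetical keys and no network calls:

    # Round-robin over a key list: count % len(keys) cycles 0, 1, 2, 0, 1, ...
    api_keys = ["key-a", "key-b", "key-c"]  # hypothetical keys
    request_count = 0

    def next_key() -> str:
        global request_count
        key = api_keys[request_count % len(api_keys)]
        request_count += 1
        return key

    print([next_key() for _ in range(5)])
    # ['key-a', 'key-b', 'key-c', 'key-a', 'key-b']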
data/ch_patent_input.xlsx ADDED
Binary file (33 kB)
 
data/ch_patent_output.xlsx ADDED
Binary file (41.5 kB)
 
data/en_qa_input.xlsx ADDED
Binary file (6.77 kB)
 
data/en_qa_output.xlsx ADDED
Binary file (5.37 kB)
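
The added spreadsheets follow the column-name convention the code relies on (inferred from the "[Input] "/"[Output] " prefixes in autotab.py): "[Input] " columns feed the prompt, "[Output] " columns with missing cells are what AutoTab fills in, and rows where the outputs are already present become in-context examples. A sketch of building such a file; the column names and path are hypothetical:

    import pandas as pd

    # "[Input] " columns are prompt fields; the None cell in an
    # "[Output] " column is what AutoTab predicts.
    df = pd.DataFrame({
        "[Input] Question": ["What is the capital of France?", "Who wrote '1984'?"],
        "[Output] Answer": ["Paris", None],
    })
    df.to_excel("data/example_input.xlsx", index=False)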
 
demo.ipynb ADDED
@@ -0,0 +1,111 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      " "
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Results saved to data/en_qa_output.xlsx\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\r"
+     ]
+    }
+   ],
+   "source": [
+    "from autotab import AutoTab\n",
+    "\n",
+    "\n",
+    "autotab = AutoTab(\n",
+    "    in_file_path=\"data/en_qa_input.xlsx\",\n",
+    "    out_file_path=\"data/en_qa_output.xlsx\",\n",
+    "    instruction=\"You should help me classify the questions and answer them.\",\n",
+    "    max_examples=5,\n",
+    "    model_name=\"Qwen/Qwen2-7B-Instruct\",\n",
+    "    generation_config={\"temperature\": 0, \"max_tokens\": 128},\n",
+    "    request_interval=0.01,\n",
+    "    api_keys=[\"sk-exhahhjfqyanmwewndukcqtrpegfdbwszkjucvcpajdufiah\"],\n",
+    "    base_url=\"https://public-beta-api.siliconflow.cn/v1\",\n",
+    "    save_every=10,\n",
+    ")\n",
+    "autotab.run()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "You should help me classify the questions and answer them.\n",
+      "\n",
+      "<Question>What is the capital of France?</Question>\n",
+      "<Category>Geography</Category>\n",
+      "<Answer>Paris</Answer>\n",
+      "\n",
+      "<Question>Who wrote '1984'?</Question>\n",
+      "<Category>Literature</Category>\n",
+      "<Answer>George Orwell</Answer>\n",
+      "\n",
+      "<Question>What is the largest planet in the solar system?</Question>\n",
+      "<Category>Astronomy</Category>\n",
+      "<Answer>Jupiter</Answer>\n",
+      "\n",
+      "<Question>Who painted the Mona Lisa?</Question>\n",
+      "<Category>Art</Category>\n",
+      "<Answer>Leonardo da Vinci</Answer>\n",
+      "\n",
+      "<Question>What is the currency of Japan?</Question>\n",
+      "<Category>Economics</Category>\n",
+      "<Answer>Yen</Answer>\n",
+      "\n",
+      "<Question>Who is the first president of the United States?</Question>\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(autotab.query_example)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "common",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
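
The printed query above ends with an open <Question> tag; the model is expected to answer in the same angle-bracket format, and extract_fields recovers each field with a non-greedy regex. A minimal sketch of that extraction step, with a hypothetical response:

    import re

    response = "<Category>History</Category>\n<Answer>George Washington</Answer>"
    for field in ["Category", "Answer"]:
        # Non-greedy (.*?) stops at the first closing tag for the field.
        match = re.search(f"<{field}>(.*?)</{field}>", response)
        print(field, "->", match.group(1) if match else "")
    # Category -> History
    # Answer -> George Washington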