Spaces:

sussahoo
/

table_extraction

Build error

sussahoo committed on Dec 14, 2022

Commit

a310b30

1 Parent(s): 09cab40

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -369,9 +369,35 @@ def create_dataframe(cells_pytess_result: list, max_cols: int, max_rows: int, cs
     return df
 def process_image(image):
-    TD_THRESHOLD = 0.9
     TSR_THRESHOLD = 0.8
     padd_top = 100
     padd_left = 100
@@ -424,10 +450,8 @@ def process_image(image):
         csv_path = "/content/sample_data/table_" + str(idx)
         df = create_dataframe(sequential_cell_img_list, max_cols, max_rows, csv_path)
         result.append(df)
-    res = result[0].rename(columns={'Item': 'name', 'Total Cost': 'amount'})[["name", "amount"]]
-    res["cost Code"] = ""
-    res = {"items": res.to_json(orient='records')}
-    return res
 title = "Interactive demo OCR: microsoft - table-transformer-detection + tesseract"

     return df
def postprocess_dataframes(result_tables: list) -> dict:
    """
    Normalize the column names of each extracted table and serialize it.

    For every table, the column whose header starts with "item"
    (case-insensitive) becomes ``name`` and the column whose header starts
    with "total", "amount" or "cost" becomes ``amount``. When several
    headers match the same target, the last matching column wins. If no
    header matches at all, the first and second columns are used
    positionally. An empty ``cost_code`` column is always appended.

    Args:
        result_tables: DataFrames to normalize, one per extracted table.

    Returns:
        A dict mapping ``"Table1" + str(index)`` to the table's rows as a
        JSON string in ``orient="records"`` form.
    """
    res = {}
    # Iterate the argument itself; the previous code looped over an
    # undefined name (`result`) and failed with NameError when called.
    for idx, table_df in enumerate(result_tables):
        result_df = pd.DataFrame()
        for pos, col in enumerate(table_df.columns):
            # Headers may be non-string labels (e.g. integers), so coerce
            # before matching. Columns are read by position so duplicated
            # header names still yield a single Series.
            label = str(col).lower()
            if label.startswith("item"):
                result_df["name"] = table_df.iloc[:, pos].copy()
            elif label.startswith(("total", "amount", "cost")):
                result_df["amount"] = table_df.iloc[:, pos].copy()
        if result_df.columns.empty:
            # No recognizable header: fall back to column positions, using
            # an empty value for any column the table does not have
            # instead of raising IndexError.
            n_cols = table_df.shape[1]
            result_df["name"] = table_df.iloc[:, 0].copy() if n_cols > 0 else ""
            result_df["amount"] = table_df.iloc[:, 1].copy() if n_cols > 1 else ""
        result_df["cost_code"] = ""
        # NOTE(review): keys come out as "Table10", "Table11", ...; the
        # format is kept unchanged because consumers may rely on it.
        res["Table1" + str(idx)] = result_df.to_json(orient="records")
    return res
 def process_image(image):
+    TD_THRESHOLD = 0.7
     TSR_THRESHOLD = 0.8
     padd_top = 100
     padd_left = 100
         csv_path = "/content/sample_data/table_" + str(idx)
         df = create_dataframe(sequential_cell_img_list, max_cols, max_rows, csv_path)
         result.append(df)
+    output = postprocess_dataframes(result)
+    return output
 title = "Interactive demo OCR: microsoft - table-transformer-detection + tesseract"