Spaces:

Alealejandrooo
/

MindBody_VS_Medserv

Running

App Files Community

Alealejandrooo committed on May 12

Commit

526fd5a

•

1 Parent(s): db67e90

updated tolerance

Browse files

Files changed (1) hide show

process.py +122 -120

process.py CHANGED Viewed

@@ -1,121 +1,123 @@
-import pandas as pd
-import gradio as gr
-import re
-from datetime import timedelta
-def process_data(files_mindbody, files_medserv, tollerance, progress=gr.Progress()):
-    mindbody = load_data(files_mindbody)
-    medserv = load_data(files_medserv)
-    # Split 'Client' names into first name and last name components for both DataFrames
-    medserv[['Last Name', 'First Name']] = medserv['Client'].str.split(',', expand=True)
-    mindbody[['Last Name', 'First Name']] = mindbody['Client'].str.split(',', expand=True)
-    # Initialize an empty list to store unmatched rows
-    unmatched_rows = []
-    rows = len(mindbody)
-    # Iterate through each row in the mindbody DataFrame
-    for idx in progress.tqdm(range(rows), desc='Analyzing files...'):
-        # Extract relevant information from the current row
-        date = mindbody.iloc[idx]['DOS']
-        first_name = mindbody.iloc[idx]['First Name']
-        last_name = mindbody.iloc[idx]['Last Name']
-        # Define the range of dates to search for a match in medserv
-        date_range = [date - timedelta(days= tollerance), date, date + timedelta(days=tollerance)]
-        # Filter medserv based on the date range and name criteria
-        matches = medserv[((medserv['DOS'].isin(date_range)) &
-                        ((medserv['First Name'] == first_name) |
-                            (medserv['Last Name'] == last_name)))]
-        # If no match is found, append the row to the unmatched_rows list
-        if matches.empty:
-            unmatched_rows.append(mindbody.iloc[idx])
-    # Create a DataFrame from the unmatched_rows list
-    unmatched_df = pd.DataFrame(unmatched_rows, columns=mindbody.columns)
-    # Specify the columns to include in the output Excel file
-    columns_to_include = ['DOS', 'Client ID', 'Client', 'Sale ID', 'Item name', 'Location']
-    # Format the 'DOS' column to remove time part
-    unmatched_df['DOS'] = unmatched_df['DOS'].dt.strftime('%d-%m-%Y')
-    output_file_path = 'Comparison Results.xlsx'
-    unmatched_df[columns_to_include].to_excel(output_file_path, index=False)
-    return output_file_path
-def load_data(files):
-    # Check if a single file or multiple files are provided
-    filepaths = [file.name for file in files]
-    # Load and concatenate multiple files if provided
-    dfs = []
-    for filepath in filepaths:
-        if filepath.endswith('.xlsx') or filepath.endswith('.xls'):
-            dfs.append(pd.read_excel(filepath))
-        else:
-            raise gr.Error("Unsupported file format: Please provide a .xls or .xlsx file")
-    # Concatenate dataframes if more than one file is provided
-    if len(dfs) > 1:
-        df = pd.concat(dfs, ignore_index=True)
-    else:
-        df = dfs[0]
-    # Find and rename the date column to 'DOS'
-    date_column = find_date_column(df)
-    if date_column:
-        df.rename(columns={date_column: 'DOS'}, inplace=True)
-    # Find and rename the name column to 'Client'
-    name_column = find_name_column(df)
-    if name_column:
-        df.rename(columns={name_column: 'Client'}, inplace=True)
-    return df
-def find_name_column(df):
-    name_pattern = r"^[A-Za-z'-]+,\s[A-Za-z'-]+(?:\s[A-Za-z'-]+)*$"  # Regex pattern for last name, first name(s)
-    max_count = 0
-    name_column = None
-    for column in df.columns:
-        # Count matches of the name pattern in each column
-        matches = df[column].astype(str).apply(lambda x: bool(re.match(name_pattern, x)))
-        valid_count = matches.sum()  # Sum of True values indicating valid names
-        # Select the column with the maximum count of valid names
-        if valid_count > max_count:
-            max_count = valid_count
-            name_column = column
-    return name_column
-def find_date_column(df):
-    date_pattern = r"\b\d{2,4}[-/]\d{1,2}[-/]\d{2,4}\b"  # Regex pattern for common date formats
-    max_count = 0
-    date_column = None
-    for column in df.columns:
-        # Count matches of the date pattern in each column
-        matches = df[column].astype(str).str.contains(date_pattern, na=False)
-        valid_count = matches.sum()  # Sum of True values indicating valid dates
-        # Select the column with the maximum count of valid dates
-        if valid_count > max_count:
-            max_count = valid_count
-            date_column = column
     return date_column

+import pandas as pd
+import gradio as gr
+import re
+from datetime import timedelta
+def process_data(files_mindbody, files_medserv, tollerance, progress=gr.Progress()):
+    mindbody = load_data(files_mindbody)
+    medserv = load_data(files_medserv)
+    # Split 'Client' names into first name and last name components for both DataFrames
+    medserv[['Last Name', 'First Name']] = medserv['Client'].str.split(',', expand=True)
+    mindbody[['Last Name', 'First Name']] = mindbody['Client'].str.split(',', expand=True)
+    # Initialize an empty list to store unmatched rows
+    unmatched_rows = []
+    rows = len(mindbody)
+    # Iterate through each row in the mindbody DataFrame
+    for idx in progress.tqdm(range(rows), desc='Analyzing files...'):
+        # Extract relevant information from the current row
+        date = mindbody.iloc[idx]['DOS']
+        first_name = mindbody.iloc[idx]['First Name']
+        last_name = mindbody.iloc[idx]['Last Name']
+        # Define the range of dates to search for a match in medserv
+        date_range = [date - timedelta(days=i) for i in range(tollerance, -tollerance-1, -1)]
+        # Remove the time component from the dates in date_range
+        date_range = [d.date() for d in date_range]
+        # Filter medserv based on the date range and name criteria
+        matches = medserv[((medserv['DOS'].dt.date.isin(date_range)) &
+                        ((medserv['First Name'] == first_name) |
+                            (medserv['Last Name'] == last_name)))]
+        # If no match is found, append the row to the unmatched_rows list
+        if matches.empty:
+            unmatched_rows.append(mindbody.iloc[idx])
+    # Create a DataFrame from the unmatched_rows list
+    unmatched_df = pd.DataFrame(unmatched_rows, columns=mindbody.columns)
+    # Specify the columns to include in the output Excel file
+    columns_to_include = ['DOS', 'Client ID', 'Client', 'Sale ID', 'Item name', 'Location']
+    # Format the 'DOS' column to remove time part
+    unmatched_df['DOS'] = unmatched_df['DOS'].dt.strftime('%d-%m-%Y')
+    output_file_path = 'Comparison Results.xlsx'
+    unmatched_df[columns_to_include].to_excel(output_file_path, index=False)
+    return output_file_path
+def load_data(files):
+    # Check if a single file or multiple files are provided
+    filepaths = [file.name for file in files]
+    # Load and concatenate multiple files if provided
+    dfs = []
+    for filepath in filepaths:
+        if filepath.endswith('.xlsx') or filepath.endswith('.xls'):
+            dfs.append(pd.read_excel(filepath))
+        else:
+            raise gr.Error("Unsupported file format: Please provide a .xls or .xlsx file")
+    # Concatenate dataframes if more than one file is provided
+    if len(dfs) > 1:
+        df = pd.concat(dfs, ignore_index=True)
+    else:
+        df = dfs[0]
+    # Find and rename the date column to 'DOS'
+    date_column = find_date_column(df)
+    if date_column:
+        df.rename(columns={date_column: 'DOS'}, inplace=True)
+    # Find and rename the name column to 'Client'
+    name_column = find_name_column(df)
+    if name_column:
+        df.rename(columns={name_column: 'Client'}, inplace=True)
+    return df
+def find_name_column(df):
+    name_pattern = r"^[A-Za-z'-]+,\s[A-Za-z'-]+(?:\s[A-Za-z'-]+)*$"  # Regex pattern for last name, first name(s)
+    max_count = 0
+    name_column = None
+    for column in df.columns:
+        # Count matches of the name pattern in each column
+        matches = df[column].astype(str).apply(lambda x: bool(re.match(name_pattern, x)))
+        valid_count = matches.sum()  # Sum of True values indicating valid names
+        # Select the column with the maximum count of valid names
+        if valid_count > max_count:
+            max_count = valid_count
+            name_column = column
+    return name_column
+def find_date_column(df):
+    date_pattern = r"\b\d{2,4}[-/]\d{1,2}[-/]\d{2,4}\b"  # Regex pattern for common date formats
+    max_count = 0
+    date_column = None
+    for column in df.columns:
+        # Count matches of the date pattern in each column
+        matches = df[column].astype(str).str.contains(date_pattern, na=False)
+        valid_count = matches.sum()  # Sum of True values indicating valid dates
+        # Select the column with the maximum count of valid dates
+        if valid_count > max_count:
+            max_count = valid_count
+            date_column = column
     return date_column