Alealejandrooo
committed on
Commit
•
526fd5a
1
Parent(s):
db67e90
updated tolerance
Browse files- process.py +122 -120
process.py
CHANGED
@@ -1,121 +1,123 @@
|
|
1 |
-
import pandas as pd
|
2 |
-
import gradio as gr
|
3 |
-
import re
|
4 |
-
from datetime import timedelta
|
5 |
-
|
6 |
-
|
7 |
-
def process_data(files_mindbody, files_medserv, tollerance, progress=gr.Progress()):
|
8 |
-
|
9 |
-
mindbody = load_data(files_mindbody)
|
10 |
-
medserv = load_data(files_medserv)
|
11 |
-
|
12 |
-
# Split 'Client' names into first name and last name components for both DataFrames
|
13 |
-
medserv[['Last Name', 'First Name']] = medserv['Client'].str.split(',', expand=True)
|
14 |
-
mindbody[['Last Name', 'First Name']] = mindbody['Client'].str.split(',', expand=True)
|
15 |
-
# Initialize an empty list to store unmatched rows
|
16 |
-
unmatched_rows = []
|
17 |
-
|
18 |
-
rows = len(mindbody)
|
19 |
-
|
20 |
-
# Iterate through each row in the mindbody DataFrame
|
21 |
-
for idx in progress.tqdm(range(rows), desc='Analyzing files...'):
|
22 |
-
# Extract relevant information from the current row
|
23 |
-
date = mindbody.iloc[idx]['DOS']
|
24 |
-
first_name = mindbody.iloc[idx]['First Name']
|
25 |
-
last_name = mindbody.iloc[idx]['Last Name']
|
26 |
-
|
27 |
-
# Define the range of dates to search for a match in medserv
|
28 |
-
date_range = [date - timedelta(days=
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
df = dfs
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
|
|
|
|
121 |
return date_column
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import gradio as gr
|
3 |
+
import re
|
4 |
+
from datetime import timedelta
|
5 |
+
|
6 |
+
|
7 |
+
def process_data(files_mindbody, files_medserv, tollerance, progress=gr.Progress()):
    """Export the mindbody rows that have no counterpart in medserv.

    Both inputs are loaded with ``load_data``. A mindbody row counts as
    matched when at least one medserv row has a 'DOS' calendar date within
    ``tollerance`` days (before or after) of the mindbody 'DOS' and shares
    either the same first name or the same last name.

    Args:
        files_mindbody: Uploaded mindbody files, forwarded to ``load_data``.
        files_medserv: Uploaded medserv files, forwarded to ``load_data``.
        tollerance: Number of days around the mindbody date that still counts
            as a match. Coerced with ``int()``.
        progress: Gradio progress tracker used to report loop progress.

    Returns:
        The path of the Excel file ('Comparison Results.xlsx') that lists the
        unmatched mindbody rows.
    """
    mindbody = load_data(files_mindbody)
    medserv = load_data(files_medserv)

    # Split 'Client' into last/first name on the FIRST comma only, so a name
    # containing an extra comma still yields exactly two columns. The first
    # name keeps whatever whitespace followed the comma in both frames.
    medserv[['Last Name', 'First Name']] = medserv['Client'].str.split(',', n=1, expand=True)
    mindbody[['Last Name', 'First Name']] = mindbody['Client'].str.split(',', n=1, expand=True)

    # range() needs an int; coerce in case the caller passes a float
    # (assumption: numeric UI inputs may do so - verify against the caller).
    tolerance_days = int(tollerance)
    day_offsets = [
        timedelta(days=i) for i in range(tolerance_days, -tolerance_days - 1, -1)
    ]

    # Loop-invariant medserv views, computed once instead of once per row.
    medserv_dates = medserv['DOS'].dt.date
    medserv_first_names = medserv['First Name']
    medserv_last_names = medserv['Last Name']

    # Positions (not row copies) of unmatched rows: selecting them from the
    # original frame afterwards preserves column dtypes even when nothing is
    # unmatched, which keeps the '.dt' accessor below usable.
    unmatched_positions = []

    rows = len(mindbody)

    for idx in progress.tqdm(range(rows), desc='Analyzing files...'):
        row = mindbody.iloc[idx]
        date = row['DOS']

        # Calendar dates (time component dropped) accepted for this row.
        date_range = [(date - offset).date() for offset in day_offsets]

        date_ok = medserv_dates.isin(date_range)
        name_ok = ((medserv_first_names == row['First Name']) |
                   (medserv_last_names == row['Last Name']))

        if not (date_ok & name_ok).any():
            unmatched_positions.append(idx)

    unmatched_df = mindbody.iloc[unmatched_positions].copy()

    # Columns written to the output Excel file.
    columns_to_include = ['DOS', 'Client ID', 'Client', 'Sale ID', 'Item name', 'Location']

    # Render 'DOS' as day-month-year text, dropping the time part.
    unmatched_df['DOS'] = unmatched_df['DOS'].dt.strftime('%d-%m-%Y')

    output_file_path = 'Comparison Results.xlsx'
    unmatched_df[columns_to_include].to_excel(output_file_path, index=False)

    return output_file_path
|
54 |
+
|
55 |
+
|
56 |
+
|
57 |
+
def load_data(files):
    """Load the uploaded Excel files into one DataFrame with normalized headers.

    Every file is read with ``pd.read_excel``; several files are concatenated
    row-wise. The column detected by ``find_date_column`` is renamed to 'DOS'
    and the column detected by ``find_name_column`` is renamed to 'Client'.

    Args:
        files: Iterable of uploaded file objects exposing a ``.name`` path.

    Returns:
        The (possibly concatenated) DataFrame with renamed columns.

    Raises:
        gr.Error: If no file is provided, or a file is not .xls/.xlsx.
    """
    # Fail with a user-facing message instead of an IndexError/TypeError
    # when nothing was uploaded.
    if not files:
        raise gr.Error("No file provided: Please provide a .xls or .xlsx file")

    filepaths = [file.name for file in files]

    dfs = []
    for filepath in filepaths:
        # Case-insensitive so that e.g. 'REPORT.XLSX' is accepted as well.
        if not filepath.lower().endswith(('.xlsx', '.xls')):
            raise gr.Error("Unsupported file format: Please provide a .xls or .xlsx file")
        dfs.append(pd.read_excel(filepath))

    # Concatenate only when there is more than one frame; a single frame is
    # used as-is.
    if len(dfs) > 1:
        df = pd.concat(dfs, ignore_index=True)
    else:
        df = dfs[0]

    # 'is not None' rather than truthiness, so a falsy column label
    # (e.g. 0 or '') is still renamed.
    date_column = find_date_column(df)
    if date_column is not None:
        df.rename(columns={date_column: 'DOS'}, inplace=True)

    name_column = find_name_column(df)
    if name_column is not None:
        df.rename(columns={name_column: 'Client'}, inplace=True)

    return df
|
86 |
+
|
87 |
+
|
88 |
+
def find_name_column(df):
    """Return the label of the column holding the most 'Last, First' names.

    Each cell is converted to text and tested against a
    "last name, first name(s)" pattern. The first column with the highest
    non-zero number of matching cells wins.

    Args:
        df: DataFrame whose columns are inspected.

    Returns:
        The winning column label, or None when no cell in any column matches.
    """
    # Last name, comma, one whitespace character, then one or more first names.
    name_regex = re.compile(r"^[A-Za-z'-]+,\s[A-Za-z'-]+(?:\s[A-Za-z'-]+)*$")

    best_column, best_hits = None, 0

    for candidate in df.columns:
        cells = df[candidate].astype(str)
        hits = sum(1 for cell in cells if name_regex.match(cell))

        # Strict comparison: on a tie the earlier column is kept.
        if hits > best_hits:
            best_column, best_hits = candidate, hits

    return best_column
|
105 |
+
|
106 |
+
|
107 |
+
def find_date_column(df):
    """Return the label of the column containing the most date-like values.

    Each cell is converted to text and searched for a numeric date such as
    '2023-01-05' or '05/01/2023'. The first column with the highest non-zero
    number of matching cells wins.

    Args:
        df: DataFrame whose columns are inspected.

    Returns:
        The winning column label, or None when no cell in any column matches.
    """
    # Digit groups separated by '-' or '/'.
    date_pattern = r"\b\d{2,4}[-/]\d{1,2}[-/]\d{2,4}\b"

    hits_per_column = {
        label: int(df[label].astype(str).str.contains(date_pattern, na=False).sum())
        for label in df.columns
    }

    if not hits_per_column:
        return None

    # max() returns the first key reaching the maximum, so ties resolve to
    # the earliest column.
    best_label = max(hits_per_column, key=hits_per_column.get)
    if hits_per_column[best_label] > 0:
        return best_label
    return None
|