Alealejandrooo committed on
Commit
526fd5a
1 Parent(s): db67e90

updated tolerance

Browse files
Files changed (1) hide show
  1. process.py +122 -120
process.py CHANGED
@@ -1,121 +1,123 @@
1
- import pandas as pd
2
- import gradio as gr
3
- import re
4
- from datetime import timedelta
5
-
6
-
7
- def process_data(files_mindbody, files_medserv, tollerance, progress=gr.Progress()):
8
-
9
- mindbody = load_data(files_mindbody)
10
- medserv = load_data(files_medserv)
11
-
12
- # Split 'Client' names into first name and last name components for both DataFrames
13
- medserv[['Last Name', 'First Name']] = medserv['Client'].str.split(',', expand=True)
14
- mindbody[['Last Name', 'First Name']] = mindbody['Client'].str.split(',', expand=True)
15
- # Initialize an empty list to store unmatched rows
16
- unmatched_rows = []
17
-
18
- rows = len(mindbody)
19
-
20
- # Iterate through each row in the mindbody DataFrame
21
- for idx in progress.tqdm(range(rows), desc='Analyzing files...'):
22
- # Extract relevant information from the current row
23
- date = mindbody.iloc[idx]['DOS']
24
- first_name = mindbody.iloc[idx]['First Name']
25
- last_name = mindbody.iloc[idx]['Last Name']
26
-
27
- # Define the range of dates to search for a match in medserv
28
- date_range = [date - timedelta(days= tollerance), date, date + timedelta(days=tollerance)]
29
-
30
- # Filter medserv based on the date range and name criteria
31
- matches = medserv[((medserv['DOS'].isin(date_range)) &
32
- ((medserv['First Name'] == first_name) |
33
- (medserv['Last Name'] == last_name)))]
34
-
35
- # If no match is found, append the row to the unmatched_rows list
36
- if matches.empty:
37
- unmatched_rows.append(mindbody.iloc[idx])
38
-
39
- # Create a DataFrame from the unmatched_rows list
40
- unmatched_df = pd.DataFrame(unmatched_rows, columns=mindbody.columns)
41
-
42
- # Specify the columns to include in the output Excel file
43
- columns_to_include = ['DOS', 'Client ID', 'Client', 'Sale ID', 'Item name', 'Location']
44
-
45
- # Format the 'DOS' column to remove time part
46
- unmatched_df['DOS'] = unmatched_df['DOS'].dt.strftime('%d-%m-%Y')
47
-
48
- output_file_path = 'Comparison Results.xlsx'
49
- unmatched_df[columns_to_include].to_excel(output_file_path, index=False)
50
-
51
- return output_file_path
52
-
53
-
54
-
55
- def load_data(files):
56
- # Check if a single file or multiple files are provided
57
- filepaths = [file.name for file in files]
58
-
59
- # Load and concatenate multiple files if provided
60
- dfs = []
61
- for filepath in filepaths:
62
- if filepath.endswith('.xlsx') or filepath.endswith('.xls'):
63
- dfs.append(pd.read_excel(filepath))
64
- else:
65
- raise gr.Error("Unsupported file format: Please provide a .xls or .xlsx file")
66
-
67
- # Concatenate dataframes if more than one file is provided
68
- if len(dfs) > 1:
69
- df = pd.concat(dfs, ignore_index=True)
70
- else:
71
- df = dfs[0]
72
-
73
- # Find and rename the date column to 'DOS'
74
- date_column = find_date_column(df)
75
- if date_column:
76
- df.rename(columns={date_column: 'DOS'}, inplace=True)
77
-
78
- # Find and rename the name column to 'Client'
79
- name_column = find_name_column(df)
80
- if name_column:
81
- df.rename(columns={name_column: 'Client'}, inplace=True)
82
-
83
- return df
84
-
85
-
86
- def find_name_column(df):
87
- name_pattern = r"^[A-Za-z'-]+,\s[A-Za-z'-]+(?:\s[A-Za-z'-]+)*$" # Regex pattern for last name, first name(s)
88
-
89
- max_count = 0
90
- name_column = None
91
-
92
- for column in df.columns:
93
- # Count matches of the name pattern in each column
94
- matches = df[column].astype(str).apply(lambda x: bool(re.match(name_pattern, x)))
95
- valid_count = matches.sum() # Sum of True values indicating valid names
96
-
97
- # Select the column with the maximum count of valid names
98
- if valid_count > max_count:
99
- max_count = valid_count
100
- name_column = column
101
-
102
- return name_column
103
-
104
-
105
- def find_date_column(df):
106
- date_pattern = r"\b\d{2,4}[-/]\d{1,2}[-/]\d{2,4}\b" # Regex pattern for common date formats
107
-
108
- max_count = 0
109
- date_column = None
110
-
111
- for column in df.columns:
112
- # Count matches of the date pattern in each column
113
- matches = df[column].astype(str).str.contains(date_pattern, na=False)
114
- valid_count = matches.sum() # Sum of True values indicating valid dates
115
-
116
- # Select the column with the maximum count of valid dates
117
- if valid_count > max_count:
118
- max_count = valid_count
119
- date_column = column
120
-
 
 
121
  return date_column
 
1
+ import pandas as pd
2
+ import gradio as gr
3
+ import re
4
+ from datetime import timedelta
5
+
6
+
7
def process_data(files_mindbody, files_medserv, tollerance, progress=gr.Progress()):
    """Write the Mindbody rows that have no Medserv counterpart to an Excel file.

    A Mindbody row counts as matched when at least one Medserv row has a
    'DOS' date within ``tollerance`` calendar days (before or after) and
    shares either the first name or the last name parsed from 'Client'.

    Args:
        files_mindbody: Uploaded Mindbody spreadsheets, forwarded to ``load_data``.
        files_medserv: Uploaded Medserv spreadsheets, forwarded to ``load_data``.
        tollerance: Allowed date difference in whole days. Converted with
            ``int`` so numeric inputs such as ``2.0`` are accepted.
        progress: Gradio progress tracker used to report loop progress.

    Returns:
        Path of the written workbook ('Comparison Results.xlsx').
    """
    mindbody = load_data(files_mindbody)
    medserv = load_data(files_medserv)

    tolerance_days = int(tollerance)

    # Split 'Client' ("Last, First") into its two components for both frames.
    for frame in (mindbody, medserv):
        # n=1 keeps the split at two parts even when a name contains extra
        # commas; reindex guarantees both columns exist when no value has a
        # comma at all; object dtype keeps the .str accessor usable on an
        # all-missing column.
        parts = (
            frame['Client']
            .str.split(',', n=1, expand=True)
            .reindex(columns=[0, 1])
            .astype(object)
        )
        # Trim so the space after the comma does not take part in matching.
        frame['Last Name'] = parts[0].str.strip()
        frame['First Name'] = parts[1].str.strip()

    # Compare on calendar days only: drop the time component once, up front.
    mindbody_days = mindbody['DOS'].dt.normalize()
    medserv_days = medserv['DOS'].dt.normalize()
    window = pd.Timedelta(days=tolerance_days)
    medserv_first = medserv['First Name']
    medserv_last = medserv['Last Name']

    candidates = list(zip(mindbody_days, mindbody['First Name'], mindbody['Last Name']))
    is_unmatched = []

    for day, first_name, last_name in progress.tqdm(candidates, desc='Analyzing files...'):
        within_window = (medserv_days - day).abs() <= window
        # NOTE(review): one agreeing name (first OR last) is enough for a
        # match -- confirm this leniency is intended.
        same_person = (medserv_first == first_name) | (medserv_last == last_name)
        is_unmatched.append(not (within_window & same_person).any())

    # Specify the columns to include in the output Excel file
    columns_to_include = ['DOS', 'Client ID', 'Client', 'Sale ID', 'Item name', 'Location']

    # Selecting with a boolean mask keeps the datetime dtype of 'DOS' even
    # when nothing is unmatched, so the .dt formatting below cannot fail on
    # an empty result.
    unmatched_mask = pd.Series(is_unmatched, dtype=bool).to_numpy()
    unmatched_df = mindbody.loc[unmatched_mask, columns_to_include].copy()

    # Format the 'DOS' column to remove time part
    unmatched_df['DOS'] = unmatched_df['DOS'].dt.strftime('%d-%m-%Y')

    output_file_path = 'Comparison Results.xlsx'
    unmatched_df.to_excel(output_file_path, index=False)

    return output_file_path
54
+
55
+
56
+
57
def load_data(files):
    """Read uploaded Excel files into one DataFrame with normalised headers.

    The detected date column is renamed to 'DOS' and the detected name
    column to 'Client'; a column is left untouched when detection finds
    nothing.

    Args:
        files: One upload or a list of uploads. Each entry is either an
            object exposing the file path as ``.name`` or a plain path string.

    Returns:
        The (concatenated) DataFrame.

    Raises:
        gr.Error: If no file was supplied or a file is not .xls/.xlsx.
    """
    if not files:
        raise gr.Error("No file provided: Please upload at least one .xls or .xlsx file")

    # Accept a single upload as well as a list of uploads.
    if not isinstance(files, (list, tuple)):
        files = [files]

    dfs = []
    for file in files:
        filepath = getattr(file, 'name', file)
        # Compare case-insensitively so '.XLSX' is accepted too.
        if not str(filepath).lower().endswith(('.xlsx', '.xls')):
            raise gr.Error("Unsupported file format: Please provide a .xls or .xlsx file")
        dfs.append(pd.read_excel(filepath))

    # Concatenate dataframes if more than one file is provided
    df = pd.concat(dfs, ignore_index=True) if len(dfs) > 1 else dfs[0]

    # Find and rename the date column to 'DOS'. Test against None rather
    # than truthiness: a valid column label can be falsy (e.g. 0).
    date_column = find_date_column(df)
    if date_column is not None:
        df.rename(columns={date_column: 'DOS'}, inplace=True)

    # Find and rename the name column to 'Client'
    name_column = find_name_column(df)
    if name_column is not None:
        df.rename(columns={name_column: 'Client'}, inplace=True)

    return df
86
+
87
+
88
def find_name_column(df):
    """Return the label of the column holding the most "Last, First" names.

    Every cell is converted to text and tested against the name pattern.
    The first column with the strictly highest number of hits wins; ``None``
    is returned when no cell in any column matches.
    """
    # Last name, comma, whitespace, first name(s)
    looks_like_name = re.compile(
        r"^[A-Za-z'-]+,\s[A-Za-z'-]+(?:\s[A-Za-z'-]+)*$"
    ).match

    best_column = None
    best_hits = 0

    for candidate in df.columns:
        as_text = df[candidate].astype(str)
        hits = as_text.apply(lambda cell: looks_like_name(cell) is not None).sum()

        # Strict comparison: ties keep the earlier column.
        if hits > best_hits:
            best_column, best_hits = candidate, hits

    return best_column
105
+
106
+
107
def find_date_column(df):
    """Return the label of the column containing the most date-like values.

    Each column is converted to text and searched for the date pattern.
    The first column with the highest number of hits is returned, or
    ``None`` when no column contains a single hit.
    """
    date_pattern = r"\b\d{2,4}[-/]\d{1,2}[-/]\d{2,4}\b"  # Regex pattern for common date formats

    # Tally the date-looking cells of every column, in column order.
    tallies = [
        (df[label].astype(str).str.contains(date_pattern, na=False).sum(), label)
        for label in df.columns
    ]

    # max() keeps the first of several equal maxima, so ties go to the
    # earlier column.
    top_hits, top_label = max(tallies, key=lambda pair: pair[0], default=(0, None))

    if top_hits > 0:
        return top_label
    return None