fschwartzer committed on
Commit
905b603
1 Parent(s): 40ac373

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -14
app.py CHANGED
@@ -22,25 +22,28 @@ def fetch_data_to_dataframe(query, limit=50, source="mercadolibre"):
22
  return df
23
  return pd.DataFrame()
24
 
25
- def refinar_resultados(df, exclude_word="conjunto", include_word=False):
26
  df['Title'] = df['Title'].astype(str)
27
- df_refinado = df[~df['Title'].str.contains("kit", case=False, na=False)]
28
- df_refinado = df[~df['Title'].str.contains("Kit", case=False, na=False)]
29
- df_refinado = df[~df['Title'].str.contains("conj", case=False, na=False)]
30
- df_refinado = df[~df['Title'].str.contains(" e ", case=False, na=False)]
31
- padrao_unidades = r'\b(\d+)\s*(unidade|unidades|pacote|pacotes|caixa|caixas)\b'
32
- df_refinado = df_refinado[~df_refinado['Title'].str.contains(padrao_unidades, case=False, regex=True)]
33
-
34
- if not include_word:
35
- # Exclude results containing "conjunto" if it's not part of the original query
36
- df_refinado = df_refinado[~df_refinado['Title'].str.contains(exclude_word, case=False)]
37
-
 
 
38
  return df_refinado
39
 
 
40
  def get_best_match(query, choices, limit=50):
41
  # Using RapidFuzz for improved performance and fuzzy matching
42
  matches = process.extract(query, choices, scorer=fuzz.WRatio, limit=limit)
43
- return [match[0] for match in matches if match[1] > 70]
44
 
45
  def match_query_words_in_titles(query, title):
46
  """
@@ -85,7 +88,7 @@ def calcular_fator_avaliacao(titulo, EC, PU):
85
  def select_nearest_items(df, query):
86
  # Lower the title similarity threshold if necessary
87
  df['Title_Similarity'] = df['Title'].apply(lambda x: fuzz.WRatio(query, x))
88
- df_filtered = df[df['Title_Similarity'] > 70] # Adjusted threshold
89
 
90
  # Calculate mode price in a more inclusive manner
91
  mode_price = df_filtered['Price'].mode()
 
22
  return df
23
  return pd.DataFrame()
24
 
25
def refinar_resultados(df, include_words=()):
    """Drop rows whose title suggests a multi-item listing (kit, pack, box...).

    The ``Title`` column of ``df`` is converted to ``str`` in place before
    filtering, so the caller's frame is modified by this call.

    Args:
        df: DataFrame with a ``Title`` column.
        include_words: Words (or a single string) taken from the user's
            query. Any exclusion keyword found in it is NOT used for
            filtering, so a search for "kit ..." still returns kits. The
            comparison is case-insensitive. ``None`` is treated as empty.

    Returns:
        The rows of ``df`` whose title matches neither an active exclusion
        keyword nor the "<number> <unit>" quantity pattern.
    """
    df['Title'] = df['Title'].astype(str)

    # Keywords that indicate the listing bundles several items.
    exclude_keywords = ["kit", "conjunto", "pacote", "caixa", "unidades"]

    # Normalise the include words so "Kit" in the query disables "kit".
    if include_words is None:
        include_words = ()
    if isinstance(include_words, str):
        allowed = include_words.lower()
    else:
        allowed = {str(word).lower() for word in include_words}
    active_keywords = [kw for kw in exclude_keywords if kw not in allowed]

    # Quantity pattern such as "3 unidade" / "10 pacotes" is always applied.
    # Non-capturing groups avoid the pandas "match groups" warning.
    alternatives = [r'\b\d+\s*(?:unidade|pacotes|caixas)\b']

    # Only add the keyword alternation when at least one keyword is active:
    # an empty group would match at every word boundary and drop every row.
    if active_keywords:
        alternatives.insert(0, r'\b(?:' + '|'.join(active_keywords) + r')\b')

    exclude_pattern = '|'.join(alternatives)

    # Perform the filtering in one operation.
    is_multiple = df['Title'].str.contains(
        exclude_pattern, case=False, regex=True, na=False
    )
    df_refinado = df[~is_multiple]
    return df_refinado
41
 
42
+
43
  def get_best_match(query, choices, limit=50):
44
  # Using RapidFuzz for improved performance and fuzzy matching
45
  matches = process.extract(query, choices, scorer=fuzz.WRatio, limit=limit)
46
+ return [match[0] for match in matches if match[1] > 65]
47
 
48
  def match_query_words_in_titles(query, title):
49
  """
 
88
  def select_nearest_items(df, query):
89
  # Lower the title similarity threshold if necessary
90
  df['Title_Similarity'] = df['Title'].apply(lambda x: fuzz.WRatio(query, x))
91
+ df_filtered = df[df['Title_Similarity'] > 65] # Adjusted threshold
92
 
93
  # Calculate mode price in a more inclusive manner
94
  mode_price = df_filtered['Price'].mode()