orionweller committed on
Commit
0bc0c39
·
1 Parent(s): 7e588ed

current updates

Browse files
Files changed (3) hide show
  1. app.py +64 -6
  2. find_splitting_words.py +11 -3
  3. requirements.txt +2 -1
app.py CHANGED
@@ -5,6 +5,8 @@ import pandas as pd
5
  from collections import defaultdict
6
  import json
7
  import copy
 
 
8
  import plotly.express as px
9
  from find_splitting_words import find_dividing_words
10
 
@@ -49,7 +51,39 @@ def get_current_data():
49
  # return the data as a CSV pandas
50
  return convert_df(pd.DataFrame(cur_query_data))
51
 
 
 
 
 
 
 
 
 
 
 
 
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
 
55
  if 'cur_instance_num' not in st.session_state:
@@ -94,7 +128,7 @@ with st.sidebar:
94
 
95
  z = st.header("Analysis Options")
96
  # sliderbar of how many Top N to choose
97
- n_relevant_docs = st.slider("Number of relevant docs", 1, 999, 20)
98
 
99
 
100
  col1, col2 = st.columns([1, 3], gap="large")
@@ -169,15 +203,39 @@ if corpus is not None and queries is not None and qrels is not None:
169
  # relevant
170
  relevant_docs = list(qrels[str(inst_num)].keys())[:n_relevant_docs]
171
  doc_texts = [(doc_id, corpus[doc_id]["title"] if "title" in corpus[doc_id] else "", corpus[doc_id]["text"]) for doc_id in relevant_docs]
172
- splitting_words = find_dividing_words(relevant_docs_text)
173
 
174
- st.markdown(splitting_words)
 
 
 
 
175
 
176
- container.subheader(f"Relevant Documents ({len(list(qrels[str(inst_num)].keys()))})")
177
  current_checkboxes = []
178
- for (docid, title, text) in doc_texts:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  current_checkboxes.append((docid, container.checkbox(f'{docid} is Non-Relevant', key=docid)))
180
- container.text_area(f"{docid}:", text)
181
 
182
 
183
  container.divider()
 
5
  from collections import defaultdict
6
  import json
7
  import copy
8
+ import re
9
+ import tqdm
10
  import plotly.express as px
11
  from find_splitting_words import find_dividing_words
12
 
 
51
  # return the data as a CSV pandas
52
  return convert_df(pd.DataFrame(cur_query_data))
53
 
54
@st.cache_data
def escape_markdown(text):
    """Return ``text`` with Markdown-significant characters backslash-escaped.

    Each pair of consecutive backticks is first rewritten to a double quote.
    After that, every backslash, backtick, ``*``, ``_``, brace, bracket,
    parenthesis, ``#``, ``+``, ``-``, ``.``, ``!``, ``|`` and ``$`` is
    prefixed with a backslash.
    """
    # The backslash is part of the set so that it escapes itself as well.
    markdown_specials = "\\`*_{}[]()#+-.!|$"
    escape_table = str.maketrans({ch: "\\" + ch for ch in markdown_specials})

    # One pass over the text: every special character becomes "\<char>".
    return text.replace("``", "\"").translate(escape_table)
65
 
66
@st.cache_data
def highlight_text(text, splitting_words):
    """Escape ``text`` for Markdown and highlight occurrences of the given words.

    Args:
        text: Raw document text or title.
        splitting_words: Words to highlight. They are matched
            case-insensitively as plain substrings (no word-boundary check).

    Returns:
        A ``(new_text, changed)`` tuple. ``new_text`` is the Markdown-escaped
        text with every match wrapped in a yellow-background ``<span>``;
        ``changed`` is True when at least one match was wrapped.
    """
    # remove anything that will mess up markdown
    # NOTE(review): the text is not HTML-escaped. The callers visible in this
    # diff pass the result to container.markdown(..., True) -- presumably
    # unsafe_allow_html -- so markup inside the corpus would be rendered as
    # HTML. Confirm the corpus is trusted.
    text = escape_markdown(text)

    # Drop empty entries: an empty alternative in the pattern would match at
    # every position and wrap the whole text in empty spans.
    words = [word for word in splitting_words if word]
    if not words:
        return text, False

    def replace_function(match):
        return f'<span style="background-color: #FFFF00">{match.group(0)}</span>'

    # The text was escaped above, so each word must go through the same
    # escaping before it is searched for; otherwise a word such as
    # "well-known" can never match the escaped "well\-known".
    # Longest first, so a word that is a prefix of another selected word does
    # not shadow the longer one in the alternation.
    escaped_words = sorted(
        (escape_markdown(word) for word in words), key=len, reverse=True
    )

    # Compile a single regular expression pattern for all splitting words
    pattern = "|".join(re.escape(word) for word in escaped_words)

    # Perform case-insensitive replacement
    new_text, num_subs = re.subn(pattern, replace_function, text, flags=re.IGNORECASE)

    return new_text, num_subs > 0
87
 
88
 
89
  if 'cur_instance_num' not in st.session_state:
 
128
 
129
  z = st.header("Analysis Options")
130
  # sliderbar of how many Top N to choose
131
+ n_relevant_docs = st.slider("Number of relevant docs", 1, 999, 300)
132
 
133
 
134
  col1, col2 = st.columns([1, 3], gap="large")
 
203
  # relevant
204
  relevant_docs = list(qrels[str(inst_num)].keys())[:n_relevant_docs]
205
  doc_texts = [(doc_id, corpus[doc_id]["title"] if "title" in corpus[doc_id] else "", corpus[doc_id]["text"]) for doc_id in relevant_docs]
206
+ splitting_words = find_dividing_words([item[1] + " " + item[2] for item in doc_texts])
207
 
208
+ # make a selectbox of these splitting words (allow multiple)
209
+ container.subheader("Splitting Words")
210
+ container.text("Select words that are relevant to the query")
211
+ splitting_word_select = container.multiselect("Splitting Words", splitting_words, key="splitting_words")
212
+ container.divider()
213
 
 
214
  current_checkboxes = []
215
+ total_changed = 0
216
+ highlighted_texts = []
217
+ highlighted_titles = []
218
+ for (docid, title, text) in tqdm.tqdm(doc_texts):
219
+ if not len(splitting_word_select):
220
+ highlighted_texts.append(text)
221
+ highlighted_titles.append(title)
222
+ continue
223
+ highlighted_text, changed_text = highlight_text(text, splitting_word_select)
224
+ highlighted_title, changed_title = highlight_text(title, splitting_word_select)
225
+ highlighted_titles.append(highlighted_title)
226
+ highlighted_texts.append(highlighted_text)
227
+ total_changed += int(int(changed_text) or int(changed_title))
228
+
229
+ container.subheader(f"Relevant Documents ({len(list(qrels[str(inst_num)].keys()))})")
230
+ container.subheader(f"Total have these words: {total_changed}")
231
+
232
+ container.divider()
233
+
234
+ for i, (docid, title, text) in enumerate(doc_texts):
235
+ container.markdown(f"## {docid}")
236
+ container.markdown(f"#### {highlighted_titles[i]}", True)
237
+ container.markdown(f"\n{highlighted_texts[i]}", True)
238
  current_checkboxes.append((docid, container.checkbox(f'{docid} is Non-Relevant', key=docid)))
 
239
 
240
 
241
  container.divider()
find_splitting_words.py CHANGED
@@ -6,11 +6,14 @@ from nltk.tokenize import word_tokenize
6
  from collections import Counter
7
  import string
8
  import os
 
9
 
10
  # Ensure you've downloaded the set of stop words the first time you run this
11
  import nltk
12
- nltk.download('punkt')
13
- nltk.download('stopwords')
 
 
14
 
15
  def preprocess_document(doc):
16
  """
@@ -30,6 +33,7 @@ def preprocess_document(doc):
30
  stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
31
  return stemmed_tokens
32
 
 
33
  def find_dividing_words(documents):
34
  """
35
  Identifies candidate words that might split the set of documents into two groups.
@@ -37,10 +41,13 @@ def find_dividing_words(documents):
37
  all_words = []
38
  per_doc_word_counts = []
39
 
 
40
  for doc in documents:
 
41
  preprocessed_doc = preprocess_document(doc)
42
  all_words.extend(preprocessed_doc)
43
  per_doc_word_counts.append(Counter(preprocessed_doc))
 
44
 
45
  # Overall word frequency
46
  overall_word_counts = Counter(all_words)
@@ -50,8 +57,9 @@ def find_dividing_words(documents):
50
  candidate_words = []
51
  for word, count in overall_word_counts.items():
52
  doc_frequency = sum(1 for doc_count in per_doc_word_counts if doc_count[word] > 0)
53
- if 0.3 * num_docs <= doc_frequency <= 0.7 * num_docs:
54
  candidate_words.append(word)
 
55
 
56
  return candidate_words
57
 
 
6
  from collections import Counter
7
  import string
8
  import os
9
+ import streamlit as st
10
 
11
  # Ensure you've downloaded the set of stop words the first time you run this
12
  import nltk
13
# Only download the NLTK resources that are not already installed.
# nltk.data.find raises LookupError for a missing resource, so each resource
# is probed on its own; probing the bare 'corpora' directory would raise on a
# fresh install before anything could be downloaded.
for _resource_path, _package in (
    ("tokenizers/punkt", "punkt"),
    ("corpora/stopwords", "stopwords"),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package)
17
 
18
  def preprocess_document(doc):
19
  """
 
33
  stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
34
  return stemmed_tokens
35
 
36
+ @st.cache_data
37
  def find_dividing_words(documents):
38
  """
39
  Identifies candidate words that might split the set of documents into two groups.
 
41
  all_words = []
42
  per_doc_word_counts = []
43
 
44
+ i = 0
45
  for doc in documents:
46
+ print(i)
47
  preprocessed_doc = preprocess_document(doc)
48
  all_words.extend(preprocessed_doc)
49
  per_doc_word_counts.append(Counter(preprocessed_doc))
50
+ i += 1
51
 
52
  # Overall word frequency
53
  overall_word_counts = Counter(all_words)
 
57
  candidate_words = []
58
  for word, count in overall_word_counts.items():
59
  doc_frequency = sum(1 for doc_count in per_doc_word_counts if doc_count[word] > 0)
60
+ if 0.35 * num_docs <= doc_frequency <= 0.75 * num_docs:
61
  candidate_words.append(word)
62
+ print("Done with dividing words")
63
 
64
  return candidate_words
65
 
requirements.txt CHANGED
@@ -3,4 +3,5 @@ streamlit==1.24.1
3
  plotly==5.15.0
4
  protobuf==3.20.0
5
  beautifulsoup4==4.12.2
6
- nltk==3.7
 
 
3
  plotly==5.15.0
4
  protobuf==3.20.0
5
  beautifulsoup4==4.12.2
6
+ nltk==3.7
7
+ tqdm