|
|
|
|
|
import re |
|
|
|
def merge_documents(main_dict, additional_json, limit=1000):
    """
    Copy up to ``limit`` not-yet-present documents into ``main_dict``.

    Each entry of ``additional_json`` is read with ``.get``: its
    ``"wikipedia_id"`` becomes the key and its ``"text"`` pieces (default
    ``[]``) are joined with single spaces and passed through
    ``sanitize_text`` to become the value. Entries whose id is already a
    key of ``main_dict`` are skipped and do not count towards ``limit``.

    Args:
        main_dict (dict): Dictionary of processed documents; mutated in place.
        additional_json (list): Iterable of document mappings.
            Presumably each "text" is a list of strings -- confirm with caller.
        limit (int): Maximum number of new documents to add.

    Returns:
        dict: The same ``main_dict`` object, after the additions.
    """
    added = 0

    for entry in additional_json:
        # Stop as soon as the quota of newly added documents is reached.
        if added >= limit:
            break

        doc_id = entry.get("wikipedia_id")
        pieces = entry.get("text", [])

        # Existing ids are never overwritten.
        if doc_id in main_dict:
            continue

        main_dict[doc_id] = sanitize_text(" ".join(pieces))
        added += 1

    print(f"{added} documents added to the main dictionary.")
    return main_dict
|
|
|
def sanitize_text(text):
    """
    Strip everything except ASCII letters, digits and whitespace from a string.

    After the filtering step, every run of whitespace is collapsed to a
    single space and leading/trailing whitespace is removed.

    Args:
        text (str): Text to sanitize.

    Returns:
        str: The sanitized text. A value that is not a ``str`` is handed
        back unchanged.
    """
    # Only strings are cleaned; anything else passes through untouched.
    if not isinstance(text, str):
        return text

    # Drop every character that is not a-z, A-Z, 0-9 or whitespace.
    filtered = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Normalise whitespace runs to one space, then trim both ends.
    collapsed = re.sub(r'\s+', ' ', filtered)
    return collapsed.strip()
|
|
|
|
|
def process_json_data(json_data):
    """
    Build a mapping from document id to sanitized document text.

    For every entry of ``json_data`` the ``"wikipedia_id"`` value is used as
    the key, and the ``"text"`` pieces (default ``[]``) are joined with
    single spaces and passed through ``sanitize_text`` to form the value.
    If an id occurs more than once, the last occurrence wins.

    Args:
        json_data (list): Iterable of document mappings supporting ``.get``.
            Presumably each "text" is a list of strings -- confirm with caller.

    Returns:
        dict: ``{wikipedia_id: sanitized_text}`` for all entries.
    """
    return {
        entry.get("wikipedia_id"): sanitize_text(" ".join(entry.get("text", [])))
        for entry in json_data
    }
|
|
|
def process_queries(json_data):
    """
    Extract the query text for every query id in a JSON object.

    Args:
        json_data (dict): Mapping of query_id to a mapping that may hold the
            query text under the key ``"input"``.

    Returns:
        dict: ``{query_id: query_text}``; the text is ``""`` when an entry
        has no ``"input"`` key.
    """
    return {
        qid: details.get("input", "")
        for qid, details in json_data.items()
    }
|
|
|
|
|
|
|
|
|
|
|
|