Spaces:

IR-IIITH
/

MultiAgent-OpenDomain-QnA-System

Sleeping

App Files Files Community

MultiAgent-OpenDomain-QnA-System / Baseline /data_processor.py

raghuv-aditya

Upload 24 files

9f21f05 verified 4 months ago

raw

history blame contribute delete

3.02 kB

	# Note: sanitize_text is defined further down in this module.

	import re

	def merge_documents(main_dict, additional_json, limit=1000):
	    """
	    Adds a subset of documents from an additional JSON file to the main dictionary.

	    Documents whose ``wikipedia_id`` is already a key of ``main_dict`` are
	    skipped and do not count towards ``limit``. ``main_dict`` is modified in
	    place and is also returned. A summary line with the number of added
	    documents is printed.

	    Args:
	        main_dict (dict): The main dictionary where processed documents are stored.
	        additional_json (list): The additional JSON data containing documents.
	            Each document is read with ``.get("wikipedia_id")`` and
	            ``.get("text", [])``; ``text`` may be a list of strings or a
	            single string.
	        limit (int): The maximum number of documents to add to the main dictionary.

	    Returns:
	        dict: The updated main dictionary with additional documents added.
	    """
	    # Number of documents actually inserted so far.
	    added = 0

	    for doc in additional_json:
	        if added >= limit:
	            break

	        wikipedia_id = doc.get("wikipedia_id")

	        # Never overwrite a document that is already present.
	        if wikipedia_id in main_dict:
	            continue

	        text = doc.get("text", [])
	        # A bare string would otherwise be joined character by character
	        # ("abc" -> "a b c"), so treat it as a single text segment.
	        if isinstance(text, str):
	            text = [text]

	        main_dict[wikipedia_id] = sanitize_text(" ".join(text))
	        added += 1

	    print(f"{added} documents added to the main dictionary.")
	    return main_dict

	def sanitize_text(text):
	    """
	    Strips every character that is not an ASCII letter, a digit or whitespace,
	    then collapses whitespace runs into single spaces and trims both ends.

	    Args:
	        text (str): Text to sanitize.
	    Returns:
	        str: Sanitized text. A non-string argument is returned unchanged.
	    """
	    # Anything that is not a string passes through untouched.
	    if not isinstance(text, str):
	        return text

	    # First drop the unwanted characters, then normalize the spacing.
	    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', text)
	    return re.sub(r'\s+', ' ', alnum_only).strip()


	def process_json_data(json_data):
	    """
	    Builds a dictionary of sanitized document text keyed by wikipedia_id.

	    Args:
	        json_data (list): Documents, each read with ``.get("wikipedia_id")``
	            and ``.get("text", [])``; ``text`` may be a list of strings or a
	            single string.

	    Returns:
	        dict: Maps each wikipedia_id to its joined and sanitized text. When
	        the same wikipedia_id occurs more than once, the last occurrence wins.
	    """
	    result_dict = {}

	    for doc in json_data:
	        wikipedia_id = doc.get("wikipedia_id")
	        text = doc.get("text", [])

	        # A bare string would otherwise be joined character by character
	        # ("abc" -> "a b c"), so treat it as a single text segment.
	        if isinstance(text, str):
	            text = [text]

	        # Join the text segments with spaces and sanitize the result.
	        result_dict[wikipedia_id] = sanitize_text(" ".join(text))

	    return result_dict

	def process_queries(json_data):
	    """
	    Flattens a mapping of query records into a mapping of query texts.

	    Args:
	        json_data (dict): Maps each query_id to a record; the query text is
	            read from the record's "input" entry.

	    Returns:
	        dict: Maps each query_id to its "input" value, or to an empty string
	        when the record has no "input" entry.
	    """
	    return {
	        query_id: query_info.get("input", "")
	        for query_id, query_info in json_data.items()
	    }

	# Example usage
	# Assuming `query_json_file` contains your JSON data
	# processed_queries = process_queries(query_json_file)