from Baseline.data_processor import process_json_data, process_queries, merge_documents from Baseline.boolean_retrieval import main_boolean_retrieval, retrieve_single_query import json def boolean_pipeline(query, wikipedia_data_path="Datasets/mini_wiki_collection.json", top_n=100): # Load the JSON files with open(wikipedia_data_path, "r") as file1: wikipedia_data = json.load(file1) # Process the JSON files wikipedia_dict = process_json_data(wikipedia_data) # Print the processed data top_results = retrieve_single_query(query, wikipedia_dict, top_n) return top_results # def main(): # # Load the JSON files # # boolean_retrieval("In the United States, why are positions like Attorney General, Secretary of State, etc. appointed by the president at the federal level but elected by the people at the state level? Had it ever been proposed to do this differently?") # # return # with open("../Datasets/mini_wiki_collection.json", "r") as file1: # Replace with the actual path to your file # wikipedia_data = json.load(file1) # with open("../Datasets/mini_wiki_collection_10000_documents.json", "r") as file1: # Replace with the actual path to your file # additional_json_file = json.load(file1) # with open("../Datasets/FinalDataset_WithModifiedQuery.json", "r") as file2: # Replace with the actual path to your file # queries_data = json.load(file2) # # Process the JSON files # wikipedia_dict = process_json_data(wikipedia_data) # updated_main_dict = merge_documents(wikipedia_dict, additional_json_file, limit=2000) # queries_dict = process_queries(queries_data) # # Print the processed data # print("Processed Wikipedia Data:") # print(wikipedia_dict["420538"]) # print("\nProcessed Queries Data:") # print(queries_dict["5xvggq"]) # top_results = main_boolean_retrieval(updated_main_dict, queries_dict) # # Print the results for a specific query # print("\nTop results for query '5xvggq':") # print(top_results.get("5xvggq", [])) # # Optionally, save the top results to a JSON file # with open("boolean_retrieval_1_2_query.json", "w") as output_file: # json.dump(top_results, output_file, indent=4) # # if __name__ == "__main__": # # main()