import os
import re
from pathlib import Path

from bs4 import BeautifulSoup

# Matches "in the table:" (plus any whitespace before it) and everything after
# it, across newlines.
_TABLE_SUFFIX_RE = re.compile(r'\s*in the table:.*$', flags=re.DOTALL)

_STATEMENT_LABEL = 'Statement:'


def process_html_file(file_path, output_path):
    """Trim the "in the table:" tail from the Statement heading of an HTML file.

    The file at ``file_path`` is parsed with BeautifulSoup's ``html.parser``.
    The first ``<h3>`` containing a ``<span>`` whose string is exactly
    ``"Statement:"`` is replaced by a freshly built
    ``<h3><span>Statement:</span> TEXT</h3>``, where TEXT is the heading's
    text with "in the table:" and everything after it removed. The resulting
    document is always written to ``output_path`` (UTF-8), even when no such
    heading is found.

    Args:
        file_path: Path of the HTML file to read.
        output_path: Path the (possibly modified) HTML is written to.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    soup = BeautifulSoup(content, 'html.parser')

    # Find the Statement line.
    statement_tag = soup.find(
        lambda tag: tag.name == "h3"
        and tag.find("span", string=_STATEMENT_LABEL)
    )

    if statement_tag:
        label_span = statement_tag.find("span", string=_STATEMENT_LABEL)

        # Collect the heading text WITHOUT the label span's own text. Using
        # get_text() on the whole <h3> would include "Statement:" and the
        # rebuilt heading would then show the label twice.
        statement_text = "".join(
            piece.strip()
            for piece in statement_tag.strings
            if piece.parent is not label_span
        )

        # Remove "in the table:" and everything after it.
        new_statement = _TABLE_SUFFIX_RE.sub('', statement_text)

        # Reconstruct the h3 tag with the modified content.
        new_h3 = soup.new_tag('h3')
        new_span = soup.new_tag('span')
        new_span.string = _STATEMENT_LABEL
        new_h3.append(new_span)
        new_h3.append(f" {new_statement}")

        # Replace the old h3 tag with the new one.
        statement_tag.replace_with(new_h3)

    # Write the modified content.
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(str(soup))


def process_directory(input_dir, output_dir):
    """Run ``process_html_file`` over the TP/TN/FP/FN subfolders of a directory.

    For each of the subfolders ``TP``, ``TN``, ``FP`` and ``FN`` under
    ``input_dir``, every ``*.html`` file is processed and written under the
    same subfolder and file name in ``output_dir``. Output subfolders are
    created as needed; a missing input subfolder is reported with a warning
    and skipped.

    Args:
        input_dir: Directory containing the input subfolders.
        output_dir: Directory that receives the processed files.
    """
    subfolders = ['TP', 'TN', 'FP', 'FN']

    for subfolder in subfolders:
        input_subfolder = Path(input_dir) / subfolder
        output_subfolder = Path(output_dir) / subfolder

        if not input_subfolder.exists():
            print(f"Warning: {input_subfolder} does not exist. Skipping.")
            continue

        output_subfolder.mkdir(parents=True, exist_ok=True)

        for file in input_subfolder.glob('*.html'):
            output_file = output_subfolder / file.name
            process_html_file(file, output_file)
            print(f"Processed: {file} -> {output_file}")


# Define input and output directories
input_directory = "htmls_DATER_mod"
output_directory = "htmls_DATER_mod2"

# Process the files
process_directory(input_directory, output_directory)

print("Processing complete. Modified files are in the output directory.")