# Tabular-LLM-Study-Forward-Simulation / modify_html_for_pos.py
# luulinh90s's picture
# update
# 8431e47
# raw
# history blame
# 1.47 kB
import os
import shutil
from pathlib import Path
import re
from bs4 import BeautifulSoup
def process_html_file(file_path, output_path):
    """Write a cleaned copy of one HTML file.

    The input is read as UTF-8 and two kinds of content are removed:

    * every ``<h3>Prediction: ...</h3>`` heading, together with one
      directly following newline if present;
    * every ``<table>`` that contains a ``<td>`` whose text is exactly
      ``verification_result``.

    The result is the document as serialized by BeautifulSoup, written to
    ``output_path`` as UTF-8.

    Args:
        file_path: Path of the HTML file to read.
        output_path: Path of the file to create or overwrite.
    """
    with open(file_path, 'r', encoding='utf-8') as src:
        raw_html = src.read()

    # Non-greedy match with DOTALL: a heading may span several lines, but
    # each match stops at the first closing </h3>.
    prediction_heading = re.compile(r'<h3>Prediction:.*?</h3>\n?', flags=re.DOTALL)
    without_predictions = prediction_heading.sub('', raw_html)

    document = BeautifulSoup(without_predictions, 'html.parser')

    # Each table is tested at the moment it is visited, i.e. after any
    # earlier table in the list has already been removed from the tree.
    for candidate in document.find_all('table'):
        marker_cell = candidate.find('td', string='verification_result')
        if not marker_cell:
            continue
        candidate.decompose()

    with open(output_path, 'w', encoding='utf-8') as dst:
        dst.write(str(document))
def process_directory(input_dir, output_dir):
    """Process every ``.html`` file under ``input_dir`` into ``output_dir``.

    The tree rooted at ``input_dir`` is walked recursively. For each file
    whose name ends with ``.html`` (case-sensitive), the path relative to
    ``input_dir`` is re-created under ``output_dir``, missing parent
    directories are created, and ``process_html_file`` is called with the
    source and destination paths. Other files are ignored.

    Args:
        input_dir: Root directory to scan.
        output_dir: Root directory that receives the processed files.
    """
    for current_dir, _subdirs, filenames in os.walk(input_dir):
        current = Path(current_dir)
        for name in filenames:
            if not name.endswith('.html'):
                continue
            source = current / name
            # Mirror the source's position below input_dir under output_dir.
            target = Path(output_dir) / source.relative_to(input_dir)
            target.parent.mkdir(parents=True, exist_ok=True)
            process_html_file(source, target)
# Source tree that is scanned for .html files, and the destination tree that
# receives the processed copies.
input_directory, output_directory = "htmls_POS", "htmls_POS_mod2"

# Runs whenever this module is executed or imported: process the whole input
# tree, then report completion on stdout.
process_directory(input_dir=input_directory, output_dir=output_directory)
print("Processing complete for POS. Modified files are in the output directory.")