# Page residue from the file viewer (not part of the script):
# uploaded by luulinh90s, commit 8873a5c ("update"), file size 1.17 kB.
import os
import shutil
from pathlib import Path
import re
# Name of the method whose HTML folders are processed; it is interpolated
# into the "htmls_<method>" and "htmls_<method>_mod" directory names below.
method = "POS"
def process_html_file(file_path, output_path):
    """Copy an HTML file to ``output_path`` with its prediction headers removed.

    Every ``<h3>Prediction:...</h3>`` element is dropped (the match may span
    several lines), together with the single line break that directly
    follows it. All other text, including the file's original line endings,
    is written back unchanged.

    Args:
        file_path: Path of the UTF-8 encoded HTML file to read.
        output_path: Path of the file to write. Missing parent directories
            are created. It may be the same as ``file_path`` because the
            input is read completely before the output is opened.
    """
    # newline='' switches off newline translation so that CRLF (or LF) files
    # keep their line endings instead of being silently normalised.
    with open(file_path, 'r', encoding='utf-8', newline='') as source:
        content = source.read()

    # DOTALL lets '.' cross line breaks inside the header; the optional
    # trailing group swallows one line terminator of any flavour so no empty
    # line is left behind.
    content = re.sub(
        r'<h3>Prediction:.*?</h3>(?:\r\n|\r|\n)?',
        '',
        content,
        flags=re.DOTALL,
    )

    # Allow direct calls with a destination whose folder does not exist yet.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8', newline='') as target:
        target.write(content)
def process_directory(input_dir, output_dir):
    """Run ``process_html_file`` on every ``.html`` file under ``input_dir``.

    The directory layout below ``input_dir`` is reproduced below
    ``output_dir``; destination folders are created as needed. Files whose
    name does not end in ``.html`` are ignored.

    Args:
        input_dir: Root directory that is walked recursively.
        output_dir: Root directory that receives the processed copies.
    """
    target_root = Path(output_dir)
    resolved_target = target_root.resolve()

    for root, dirs, files in os.walk(input_dir):
        current = Path(root)

        # If the output tree lives inside the input tree, do not descend into
        # it: otherwise files written by an earlier run would be processed
        # again and copied into output_dir/output_dir/... on every run.
        # os.walk honours in-place edits of `dirs` in its default top-down mode.
        dirs[:] = [
            name for name in dirs
            if (current / name).resolve() != resolved_target
        ]

        for file in files:
            if not file.endswith('.html'):
                continue
            input_path = current / file
            relative_path = input_path.relative_to(input_dir)
            output_path = target_root / relative_path
            output_path.parent.mkdir(parents=True, exist_ok=True)
            process_html_file(input_path, output_path)
# Source and destination trees are both named after the selected method.
input_directory, output_directory = f"htmls_{method}", f"htmls_{method}_mod"

# Write a cleaned copy of every HTML file from the source tree into the
# destination tree, then report completion.
process_directory(input_dir=input_directory, output_dir=output_directory)
print(f"Processing complete for {method}. Modified files are in the output directory.")