pgbot / clean_pg_data.py
alistairmcleay's picture
Build MMMVP of the app
9716b27
raw
history blame
879 Bytes
# Input: the scraped essays, still containing HTML markup; read by clean_pg_data().
PG_ESSAYS_FILENAME: str = "paul_graham_essays.txt"
# Output: the plain-text version written by clean_pg_data().
CLEANED_PG_ESSAYS_FILENAME: str = "paul_graham_essays_cleaned.txt"
import re
from bs4 import BeautifulSoup
# Read the text file PG_ESSAYS_FILENAME and convert it from HTML to plain text using the BeautifulSoup library.
# Mask every "20xx" year as "----" (dropping the word just before it), then write the result to CLEANED_PG_ESSAYS_FILENAME.
def clean_pg_data() -> None:
    """Convert the scraped essays from HTML to plain text and mask years.

    Reads PG_ESSAYS_FILENAME, strips the markup with BeautifulSoup's
    ``html.parser`` backend, replaces every "20" followed by two digits with
    "----", removes the word (and the single space) directly preceding each
    "----", and writes the result to CLEANED_PG_ESSAYS_FILENAME, overwriting
    any existing file.

    Both files are opened as UTF-8 explicitly so the outcome does not depend
    on the platform's default text encoding.

    Raises:
        OSError: if the input file cannot be read or the output file cannot
            be written (propagated from ``open``).
        UnicodeDecodeError: if the input file is not valid UTF-8.
    """
    with open(PG_ESSAYS_FILENAME, 'r', encoding='utf-8') as f:
        html = f.read()

    soup = BeautifulSoup(html, 'html.parser')
    text = soup.get_text()

    # Take all instances of "20" followed by two digits and replace them with "----"
    text = re.sub(r'20\d\d', '----', text)

    # Delete the word that occurs before all instances of "----"
    # (only when it is separated from the "----" by exactly one space).
    text = re.sub(r'\w+ ----', '----', text)

    with open(CLEANED_PG_ESSAYS_FILENAME, 'w', encoding='utf-8') as f:
        f.write(text)
# Script entry point: run the cleaning step only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    clean_pg_data()