TSmarizer / utils.py
import re
import requests
import docx2txt
from io import StringIO
from PyPDF2 import PdfReader
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize

emoji_pattern = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)


def clean_text(x):
    # x = x.lower()  # lowercase
    x = x.encode("ascii", "ignore").decode()  # drop non-ASCII characters
    x = re.sub(r"https*\S+", " ", x)  # URLs
    x = re.sub(r"@\S+", " ", x)  # mentions
    x = re.sub(r"#\S+", " ", x)  # hashtags
    # x = x.replace("'", "")  # remove ticks
    # x = re.sub("[%s]" % re.escape(string.punctuation), " ", x)  # punctuation
    # x = re.sub(r"\w*\d+\w*", "", x)  # numbers
    x = re.sub(r"\s{2,}", " ", x)  # collapse repeated whitespace
    x = emoji_pattern.sub(r"", x)  # emojis
    x = re.sub("[^.,!?A-Za-z0-9]+", " ", x)  # special characters except . , ! ?
    return x
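
# Worked example of clean_text (illustrative input, not taken from this repo):
#   clean_text("Check https://t.co/x #AI 🚀!!")  ->  "Check !!"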


def fetch_article_text(url: str):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    results = soup.find_all(["h1", "p"])
    text = [result.text for result in results]
    ARTICLE = " ".join(text)
    ARTICLE = ARTICLE.replace(".", ".<eos>")
    ARTICLE = ARTICLE.replace("!", "!<eos>")
    ARTICLE = ARTICLE.replace("?", "?<eos>")
    sentences = ARTICLE.split("<eos>")

    # Group sentences into word-level chunks of at most ~500 words each.
    current_chunk = 0
    chunks = []
    for sentence in sentences:
        if len(chunks) == current_chunk + 1:
            if len(chunks[current_chunk]) + len(sentence.split(" ")) <= 500:
                chunks[current_chunk].extend(sentence.split(" "))
            else:
                current_chunk += 1
                chunks.append(sentence.split(" "))
        else:
            chunks.append(sentence.split(" "))

    for chunk_id in range(len(chunks)):
        chunks[chunk_id] = " ".join(chunks[chunk_id])

    return ARTICLE, chunks


def preprocess_text_for_abstractive_summarization(tokenizer, text):
    sentences = sent_tokenize(text)

    # initialize
    length = 0
    chunk = ""
    chunks = []
    count = -1
    for sentence in sentences:
        count += 1
        combined_length = (
            len(tokenizer.tokenize(sentence)) + length
        )  # add the no. of sentence tokens to the length counter

        if combined_length <= tokenizer.max_len_single_sentence:  # if it doesn't exceed
            chunk += sentence + " "  # add the sentence to the chunk
            length = combined_length  # update the length counter

            # if it is the last sentence
            if count == len(sentences) - 1:
                chunks.append(chunk.strip())  # save the chunk
        else:
            chunks.append(chunk.strip())  # save the chunk

            # reset
            length = 0
            chunk = ""

            # take care of the overflow sentence
            chunk += sentence + " "
            length = len(tokenizer.tokenize(sentence))

            # if the overflow sentence is also the last one, save it as well
            if count == len(sentences) - 1:
                chunks.append(chunk.strip())

    return chunks
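

# Illustrative sketch (not part of the original app): how the token-budgeted chunks
# above would typically be consumed by an abstractive summarizer. The checkpoint
# name below is a placeholder assumption, not something this file prescribes.
def summarize_chunks_example(text, model_name="sshleifer/distilbart-cnn-12-6"):
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    # Chunk with the same tokenizer the model will use, so every chunk fits
    # within tokenizer.max_len_single_sentence.
    chunks = preprocess_text_for_abstractive_summarization(tokenizer, text)
    summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
    summaries = summarizer(chunks, truncation=True)
    return " ".join(s["summary_text"] for s in summaries)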


def read_pdf(file):
    pdf_reader = PdfReader(file)
    all_page_text = ""
    for page in pdf_reader.pages:
        # extract_text() may return None for pages without extractable text
        all_page_text += page.extract_text() or ""
    return all_page_text


def read_text_from_file(file):
    # read text file
    if file.type == "text/plain":
        # To convert to a string based IO:
        stringio = StringIO(file.getvalue().decode("utf-8"))
        # To read file as string:
        file_content = stringio.read()

    # read pdf file
    elif file.type == "application/pdf":
        file_content = read_pdf(file)

    # read docx file
    elif (
        file.type
        == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ):
        file_content = docx2txt.process(file)

    return file_content