Spaces:

Jobanpreet
/

LinkedInpost

Sleeping

LinkedInpost / scrap_post.py

Upload 4 files

46290fc verified 7 months ago

1.3 kB


	from unstructured.partition.html import partition_html
	# Example input URL for scrappost(): 'https://www.linkedin.com/posts/jobanpreet-singh-392581207_asr-whisper-speechrecognition-activity-7172803455718158336-MC-j?utm_source=share&utm_medium=member_desktop'


	def scrappost(url):
	    """Return the text of the first title-delimited section of a web page.

	    The page at *url* is fetched and parsed with ``partition_html``. The
	    resulting elements are scanned in document order; the text of title,
	    narrative-text and list-item elements is concatenated (with no
	    separator) until a second title is met, which marks the start of the
	    next section.

	    Args:
	        url: Address of the page to scrape, passed to ``partition_html``.

	    Returns:
	        The concatenated text of the first section, or ``None`` when the
	        page yields no title, narrative-text or list-item text at all.
	    """
	    # Elements are recognised by substring match on the class path, not by
	    # isinstance, so nothing beyond ``partition_html`` has to be imported.
	    # NOTE(review): this depends on the element classes living under
	    # ``unstructured.documents.html`` -- confirm against the installed
	    # unstructured version.
	    title_marker = 'unstructured.documents.html.HTMLTitle'
	    body_markers = (
	        'unstructured.documents.html.HTMLNarrativeText',
	        'unstructured.documents.html.HTMLListItem',
	    )

	    content = ''
	    for element in partition_html(url=url):
	        type_name = str(type(element))  # computed once per element
	        if title_marker in type_name:
	            # A title seen after text has been collected opens the next
	            # section; only the first section is wanted, so stop here.
	            if content:
	                return content
	            content += element.text
	        elif any(marker in type_name for marker in body_markers):
	            content += element.text

	    # Single-section page: return what was gathered. An empty result is
	    # reported as None, not as an empty string.
	    return content or None