LinkedInpost / scrap_post.py
Jobanpreet's picture
Upload 4 files
46290fc verified
raw
history blame
1.3 kB
from unstructured.partition.html import partition_html
#source = 'https://www.linkedin.com/posts/jobanpreet-singh-392581207_asr-whisper-speechrecognition-activity-7172803455718158336-MC-j?utm_source=share&utm_medium=member_desktop'
def scrappost(url):
    """Return the text of the first title-delimited section of a web page.

    The page at ``url`` is fetched and parsed with ``partition_html``. The
    resulting elements are walked in document order and the text of title,
    narrative-text and list-item elements is accumulated. A title element
    that arrives after some text has already been collected marks the start
    of the next section, so the text collected up to that point is returned.

    Args:
        url: Address of the HTML page to scrape (for example a public
            LinkedIn post URL).

    Returns:
        The concatenated text of the first section, or ``None`` when the
        page yields no title, narrative-text or list-item text at all.
        Element texts are joined with no separator between them.
    """
    # Element kinds are identified through the ``category`` attribute rather
    # than by matching the class path in ``str(type(element))``: the category
    # is expected to be the same for the HTML-specific subclasses and for the
    # plain Title / NarrativeText / ListItem classes, so the check does not
    # depend on which classes the installed parser emits.
    title_category = "Title"
    body_categories = ("NarrativeText", "ListItem")

    parts = []
    for element in partition_html(url=url):
        category = getattr(element, "category", None)
        is_title = category == title_category
        if not is_title and category not in body_categories:
            # Other element kinds (tables, addresses, ...) are ignored.
            continue

        if is_title and parts:
            # A new title after collected text closes the first section;
            # only that section is ever returned, so stop here instead of
            # grouping the rest of the page.
            return "".join(parts)

        text = element.text
        if text:
            # Empty texts are skipped so that ``parts`` being non-empty
            # means exactly "some content has been collected".
            parts.append(text)

    # The page ended before a second title was seen (or had no usable text).
    return "".join(parts) if parts else None