Spaces:
Sleeping
Sleeping
from unstructured.partition.html import partition_html | |
#source = 'https://www.linkedin.com/posts/jobanpreet-singh-392581207_asr-whisper-speechrecognition-activity-7172803455718158336-MC-j?utm_source=share&utm_medium=member_desktop' | |
def scrappost(url):
    """Return the text of the first title-delimited group on the page at *url*.

    The page is parsed with ``partition_html(url=url)`` and its elements are
    walked in order. Text from title, narrative-text and list-item elements
    is concatenated (with no separator) into a group; a title element that
    arrives when the current group already has text starts a new group.
    Only the first group is ever returned, so iteration stops as soon as that
    group is closed.

    Args:
        url: Address of the web page, passed straight to ``partition_html``.

    Returns:
        The concatenated text of the first group, or ``None`` when no title,
        narrative-text or list-item text was collected.
    """
    # Element kinds are recognised by matching the class path inside
    # ``str(type(element))`` rather than by ``isinstance``, so the element
    # classes themselves do not have to be imported here.
    title_marker = 'unstructured.documents.html.HTMLTitle'
    body_markers = (
        'unstructured.documents.html.HTMLNarrativeText',
        'unstructured.documents.html.HTMLListItem',
    )

    first_group = ''
    for element in partition_html(url=url):
        type_name = str(type(element))
        if title_marker in type_name:
            if first_group:
                # A new title closes the first group; later groups are never
                # returned, so there is no point in reading further.
                return first_group
            first_group += element.text
        elif any(marker in type_name for marker in body_markers):
            first_group += element.text

    # The page ended while the first group was still open (or it had no
    # matching elements at all, in which case the result is None).
    return first_group or None