from typing import Iterable, Optional

from unstructured.documents.elements import ListItem, NarrativeText, Title
from unstructured.partition.html import partition_html

# Example input URL:
# source = 'https://www.linkedin.com/posts/jobanpreet-singh-392581207_asr-whisper-speechrecognition-activity-7172803455718158336-MC-j?utm_source=share&utm_medium=member_desktop'


def _first_group_text(elements: Iterable) -> Optional[str]:
    """Return the concatenated text of the first title-delimited group.

    Title, narrative-text and list-item elements contribute their ``text``
    (joined with no separator); every other element kind is ignored.  The
    group ends at the first title encountered after some text has already
    been collected.

    Args:
        elements: Iterable of partitioned document elements.

    Returns:
        The group's text, or ``None`` when no text was collected.
    """
    parts = []
    for element in elements:
        is_title = isinstance(element, Title)
        # A title seen after text has been gathered starts the *second*
        # group; only the first group is wanted, so stop scanning here.
        if is_title and parts:
            break
        # isinstance against the base element classes is intended to match
        # both the HTML-specific subclasses and the plain element classes.
        # NOTE(review): confirm the class hierarchy for the installed
        # version of unstructured.
        if is_title or isinstance(element, (NarrativeText, ListItem)):
            # Empty text is skipped so that `parts` being non-empty means
            # "the group has content", as the original string check did.
            if element.text:
                parts.append(element.text)
    return "".join(parts) if parts else None


def scrappost(url: str) -> Optional[str]:
    """Fetch a web page and return the text of its first titled section.

    The page at ``url`` is ingested with ``partition_html`` and its elements
    are grouped by title; only the first group's text is returned.

    Args:
        url: Address of the page to ingest.

    Returns:
        The first group's concatenated text, or ``None`` if the page yielded
        no title, narrative-text or list-item content.
    """
    # Ingest and preprocess the web page into Unstructured elements.
    page_elements = partition_html(url=url)
    return _first_group_text(page_elements)