from markdownify import markdownify as md
from bs4 import BeautifulSoup as BS
from IPython.display import display, Markdown
from urllib.parse import urljoin, urlsplit
from newspaper import Article
import re
import markdown


# Matches inline Markdown links ``[text](target)``; group 1 is the link text,
# group 2 is the target. Both groups are non-greedy and do not span newlines.
_MD_LINK_RE = re.compile(r'\[(.*?)\]\((.*?)\)')


def clean(s: str) -> str:
    """Return ``s`` with tab and newline characters replaced by the literal
    two-character sequences ``\\t`` and ``\\n``."""
    s = s.replace("\t", "\\t")
    s = s.replace("\n", "\\n")
    return s


class DocTree:
    """Wrapper around a nested, indexable ``content`` structure.

    The methods treat ``content`` as nested lists whose leaves are strings.
    """

    def __init__(self, content):
        self.content = content
        # Not read by any method of this class.
        self.max_depth = 6

    def get_sections(self, *location_ids):
        """Index into ``content`` once per id, in order, and return the result.

        With no ids the whole ``content`` is returned. Lookup errors raised
        by the indexing (``IndexError``/``KeyError``) propagate unchanged.
        """
        out = self.content
        for id_ in location_ids:
            out = out[id_]
        return out

    def merge_sections(self, elems):
        """Recursively join ``elems`` into one string.

        A list whose first element is not a list is joined directly with
        ``'\\n\\n '``. Otherwise every element is merged recursively, passed
        through ``clean`` and then joined with the same separator. An empty
        sequence yields ``''``.
        """
        # Guard the ``elems[0]`` peek below: an empty section used to raise
        # IndexError; joining nothing gives '' either way.
        if not elems:
            return ''
        if not isinstance(elems[0], list):
            return '\n\n '.join(elems)
        merged = [self.merge_sections(e) for e in elems]
        return '\n\n '.join(map(clean, merged))

    def get_merged_sections(self, *location_ids):
        """Return ``merge_sections`` applied to each child of the node
        selected by ``location_ids``."""
        return [self.merge_sections(s) for s in self.get_sections(*location_ids)]

    def as_markdown(self, content):
        """Convert ``content`` to Markdown with ``markdownify``."""
        return md(content)

    def get_sections_by_depth(self, depth):
        """Return the elements found ``depth`` levels below ``self.content``."""
        return self._get_sections_by_depth(self.content, depth)

    @staticmethod
    def _get_sections_by_depth(content, depth):
        """Return a flat list of the elements ``depth`` levels below ``content``.

        ``depth == 0`` returns ``content`` itself. No merging is performed;
        each level is only concatenated into one list.
        """
        if depth == 0:
            return content
        out = []
        for elem in content:
            out += DocTree._get_sections_by_depth(elem, depth - 1)
        return out


def fix_relative_links(url: str, article_content: str) -> str:
    """Rewrite root-relative Markdown links in ``article_content`` as absolute.

    The base is ``scheme://host`` taken from ``url``. Only link targets that
    start with ``/`` are rewritten; every other link is left as is. If
    ``url`` has no scheme and host, no absolute base can be derived and all
    links are returned unchanged (previously this case raised ``TypeError``
    because a list was passed to ``urljoin``).

    Prints ``not found`` when the text contains no Markdown links at all.
    Returns the rewritten text.
    """
    parts = urlsplit(url)
    if parts.scheme and parts.netloc:
        base_url = f'{parts.scheme}://{parts.netloc}'
    else:
        base_url = None

    def _absolutize(match):
        text, target = match.group(1), match.group(2)
        if base_url is None or not target.startswith('/'):
            return match.group(0)
        return f'[{text}]({urljoin(base_url, target)})'

    # One pass over the text; ``subn`` also reports how many links matched so
    # the "no links" message can still be printed.
    fixed, n_links = _MD_LINK_RE.subn(_absolutize, article_content)
    if not n_links:
        print('not found')
    return fixed


def extract_article(url):
    """Download and parse the article at ``url``; return the ``Article``.

    Performs network I/O. Errors raised by ``newspaper`` propagate.
    """
    article = Article(url)
    # newspaper refuses to parse an article that has not been downloaded.
    article.download()
    article.parse()
    return article


def select_content(html_code, elem_class, class_name):
    """Find one element in ``html_code`` and return it converted to Markdown.

    ``elem_class`` is the tag name passed to ``find``. ``class_name`` is a
    selector: ``.name`` filters by CSS class, ``#name`` filters by id, and
    anything else applies no class/id filter. The result of ``find`` is
    passed through ``str`` before conversion.
    """
    print(f'Calling select_content with {elem_class}, {class_name}')
    if class_name.startswith('.'):
        class_name = class_name[1:]
        elem_id = None
    elif class_name.startswith('#'):
        elem_id = class_name[1:]
        class_name = None
    else:
        # NOTE: this statement (``elem_id = None``) and the rest of the
        # function continue on the next physical line of the file; the
        # original line break is preserved here.
        elem_id
= None class_name = None return md(str(BS(html_code, features="lxml").find(elem_class, class_=class_name, id=elem_id))) def split_by_heading(html_content, _i): if _i >= 7: return html_content elems = [] for idx, elem in enumerate([i for i in html_content.split(f' 0 or elem.startswith('>'): elem = f'