Spaces:
Sleeping
Sleeping
import gradio as gr | |
import requests | |
from bs4 import BeautifulSoup | |
from markdownify import MarkdownConverter | |
def md(soup, **options):
    """Render an already-parsed BeautifulSoup node as Markdown.

    Args:
        soup: The parsed node handed to ``MarkdownConverter.convert_soup``.
        **options: Keyword options passed unchanged to the
            ``MarkdownConverter`` constructor.

    Returns:
        Whatever ``convert_soup`` returns for ``soup``.
    """
    converter = MarkdownConverter(**options)
    return converter.convert_soup(soup)
def main_fn(url: str, check: list[str]) -> str:
    """Fetch ``url`` and return its main content converted to Markdown.

    The page is downloaded with ``requests`` and parsed with BeautifulSoup,
    then every ``<script>`` and ``<style>`` element is removed. The
    ``<main>`` element is converted when present; otherwise ``<body>``;
    otherwise the whole parsed document.

    Args:
        url: Address of the page to download.
        check: Tag names forwarded to the converter's ``strip`` option
            (the values selected in the "Ignore tags" checkbox group).

    Returns:
        The Markdown produced by ``md`` for the selected element.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Without a timeout an unresponsive host would block this handler forever.
    response = requests.get(url, timeout=30)

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(response.text, "html.parser")

    # The original wrote `t.clear` without calling it, so nothing was removed.
    # decompose() drops the element and its contents from the tree.
    for element in soup.find_all(["script", "style"]):
        element.decompose()

    root = soup.find("main")
    if root is None:
        root = soup.find("body")
    if root is None:
        # A fragment may have neither element; convert the whole document
        # rather than passing None to the converter.
        root = soup

    # Apply the ignore-tags selection on every path, not only for <main>.
    return md(root, strip=check)
# Gradio UI: a URL text box and an "ignore tags" checkbox group are passed to
# main_fn, and the returned Markdown is shown in a text area with a copy button.
# NOTE(review): the description below says Playwright is used, but main_fn in
# this file downloads pages with requests.get — confirm which is intended.
demo = gr.Interface(
    fn=main_fn,
    inputs=[
        gr.Text(label="URL", placeholder="https://*****"),
        gr.CheckboxGroup(
            label="Ignore tags(無視するタグ)",
            choices=["a", "img", "noscript"],
            value=["a", "img"],
        ),
    ],
    outputs=[gr.TextArea(label="Markdown", show_copy_button=True)],
    title="URL to Markdown V2",
    description="""<div style="width: fit-content; margin: 0 auto;">It gets the HTML given by the URL and converts it to Markdown. It uses Playwright, so it also supports dynamically generated HTML such as React.</div>
<div style="width: fit-content; margin: 0 auto;">URLで与えたHTMLを取得してMarkdownに変換します。Playwright を使用しているのでReactなどの動的に生成されるHTMLにも対応しています</div>""",
    allow_flagging="never",
)

# Start the web server for the interface.
demo.launch()