url-to-markdown / app.py
moritalous's picture
Update app.py
aae4846 verified
raw
history blame
1.45 kB
import gradio as gr
import requests
from bs4 import BeautifulSoup
from markdownify import MarkdownConverter
def md(soup, **options):
    """Render a parsed BeautifulSoup node as Markdown.

    Args:
        soup: Node handed to ``MarkdownConverter.convert_soup``.
        **options: Keyword options forwarded unchanged to the
            ``MarkdownConverter`` constructor.

    Returns:
        Whatever ``convert_soup`` produces for ``soup``.
    """
    converter = MarkdownConverter(**options)
    return converter.convert_soup(soup)
def main_fn(url: str, check: list[str]):
    """Download ``url`` and convert its main content to Markdown.

    Args:
        url: Address of the page to fetch.
        check: Tag names selected in the UI; forwarded to the converter as
            its ``strip`` option.

    Returns:
        Markdown for the ``<main>`` element when the page has one, otherwise
        for ``<body>``, otherwise for the whole parsed document.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive host.
    response = requests.get(url, timeout=30)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    soup = BeautifulSoup(response.text, "html.parser")

    # Remove script/style elements together with their contents so that
    # JavaScript and CSS source never ends up in the Markdown output.
    for node in soup.find_all(["script", "style"]):
        node.decompose()

    # Prefer <main>, then <body>; a fragment may have neither, in which case
    # the whole document is converted instead of passing None downstream.
    root = soup.find("main")
    if root is None:
        root = soup.find("body")
    if root is None:
        root = soup

    # Apply the user's tag selection on every path, not only for <main>.
    return md(root, strip=check)
# Blurb shown under the title (English, then Japanese).
# NOTE(review): this text says the page is fetched with Playwright, while
# main_fn in this file calls requests.get — confirm which one is intended.
_DESCRIPTION = """<div style="width: fit-content; margin: 0 auto;">It gets the HTML given by the URL and converts it to Markdown. It uses Playwright, so it also supports dynamically generated HTML such as React.</div>
<div style="width: fit-content; margin: 0 auto;">URLで与えたHTMLを取得してMarkdownに変換します。Playwright を使用しているのでReactなどの動的に生成されるHTMLにも対応しています</div>"""

# Input widgets, in the order main_fn receives their values (url, check).
_url_box = gr.Text(label="URL", placeholder="https://*****")
_ignore_tags = gr.CheckboxGroup(
    label="Ignore tags(無視するタグ)",
    choices=["a", "img", "noscript"],
    value=["a", "img"],
)

# Output widget that receives main_fn's return value.
_markdown_out = gr.TextArea(label="Markdown", show_copy_button=True)

demo = gr.Interface(
    fn=main_fn,
    inputs=[_url_box, _ignore_tags],
    outputs=[_markdown_out],
    title="URL to Markdown V2",
    description=_DESCRIPTION,
    allow_flagging="never",
)
demo.launch()