# moritalous's picture
# Update app.py
# 4fafc06 verified
# raw
# history blame
# 2.17 kB
import gradio as gr
from bs4 import BeautifulSoup
from markdownify import MarkdownConverter
from playwright.sync_api import sync_playwright
def md(soup, **options):
    """Convert a parsed BeautifulSoup node to Markdown text.

    Args:
        soup: The parsed node (or whole document) to convert.
        **options: Keyword arguments forwarded unchanged to
            ``MarkdownConverter``.

    Returns:
        Whatever ``MarkdownConverter.convert_soup`` returns for ``soup``.
    """
    converter = MarkdownConverter(**options)
    markdown_text = converter.convert_soup(soup)
    return markdown_text
def _render_page(url: str, user_agent: str) -> tuple[str, str]:
    """Load ``url`` in Chromium via Playwright and return ``(html, title)``."""
    with sync_playwright() as p:
        browser = p.chromium.launch(
            args=[
                "--single-process",
                "--no-zygote",
                "--no-sandbox",
                "--disable-gpu",
                "--disable-dev-shm-usage",
                "--headless=new",
            ]
        )
        try:
            context = browser.new_context(user_agent=user_agent)
            page = context.new_page()
            # The navigation response is intentionally not inspected: goto()
            # may return None, and the page content is used regardless.
            page.goto(url=url)
            return page.content(), page.title()
        finally:
            # Close the browser even when navigation or extraction raises.
            browser.close()


def main_fn(url: str, check: list[str], request: gr.Request):
    """Fetch ``url`` with a real browser and return the page as Markdown.

    Args:
        url: Address of the page to load.
        check: Tag names to pass to the converter's ``strip`` option
            (the "ignore tags" selection from the UI).
        request: Incoming Gradio request; its ``user-agent`` header is reused
            for the browser context.

    Returns:
        The page title, a ``======`` underline, a blank line, and then the
        Markdown for ``<main>`` if the page has one, otherwise ``<body>``,
        otherwise the whole document.

    Raises:
        KeyError: If the request carries no ``user-agent`` header.
    """
    user_agent = request.headers["user-agent"]
    content, title = _render_page(url, user_agent)

    soup = BeautifulSoup(content, features="html.parser")

    # Drop scripts and stylesheets so their source never leaks into the
    # Markdown. (The previous code referenced ``t.clear`` without calling it,
    # which removed nothing.)
    for tag in soup.find_all(["script", "style"]):
        tag.decompose()

    # Prefer the <main> landmark; fall back to <body>, then to the whole
    # document, so a page missing either element still converts.
    target = soup.find("main")
    if target is None:
        target = soup.find("body")
    if target is None:
        target = soup

    # Convert once, honouring the ignore list on every path (previously it
    # was applied only when <main> existed).
    body = md(target, strip=check)
    return f"{title}\n======\n\n{body}"
# HTML blurb shown under the title: English first, then Japanese.
_DESCRIPTION = """<div style="width: fit-content; margin: 0 auto;">It gets the HTML given by the URL and converts it to Markdown. It uses Playwright, so it also supports dynamically generated HTML such as React.</div>
<div style="width: fit-content; margin: 0 auto;">URLで与えたHTMLを取得してMarkdownに変換します。Playwright を使用しているのでReactなどの動的に生成されるHTMLにも対応しています</div>"""

# Input widgets, in the order main_fn receives their values (url, check).
_url_input = gr.Text(label="URL", placeholder="https://*****")
_ignore_tags_input = gr.CheckboxGroup(
    label="Ignore tags(無視するタグ)",
    choices=["a", "img", "noscript"],
    value=["a", "img"],
)

# Single output area holding the generated Markdown.
_markdown_output = gr.TextArea(label="Markdown", show_copy_button=True)

demo = gr.Interface(
    fn=main_fn,
    title="URL to Markdown V2",
    description=_DESCRIPTION,
    inputs=[_url_input, _ignore_tags_input],
    outputs=[_markdown_output],
    allow_flagging="never",
)

# Start the server at import/run time, listening on 0.0.0.0.
demo.launch(server_name="0.0.0.0")