Spaces:
Sleeping
Sleeping
File size: 1,445 Bytes
b3702ea aae4846 b3702ea |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
import gradio as gr
import requests
from bs4 import BeautifulSoup
from markdownify import MarkdownConverter
def md(soup, **options):
    """Convert *soup* to Markdown with markdownify's ``MarkdownConverter``.

    Every keyword argument is passed unchanged to the ``MarkdownConverter``
    constructor; the return value is whatever ``convert_soup`` produces.
    """
    converter = MarkdownConverter(**options)
    return converter.convert_soup(soup)
def main_fn(url: str, check: list[str]):
    """Download the page at *url* and return its content as Markdown.

    The ``<main>`` element is converted when the page has one; otherwise the
    ``<body>`` is used, and the whole parsed document as a last resort.
    ``<script>`` and ``<style>`` elements are removed before conversion.

    Args:
        url: Address of the page to download.
        check: Tag names passed to the converter's ``strip`` option (the
            "Ignore tags" selection from the UI).

    Returns:
        The Markdown text produced by ``md``.

    Raises:
        requests.RequestException: If the download fails or times out.
    """
    # Without a timeout a stalled server would block this call forever.
    response = requests.get(url, timeout=30)

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(response.text, "html.parser")

    # Actually remove script/style elements; the bare attribute access
    # ``t.clear`` used previously never called anything.
    for element in soup.find_all(["script", "style"]):
        element.decompose()

    # Prefer <main>, then <body>; html.parser does not invent a <body> for
    # fragments, so fall back to the whole document instead of passing None.
    root = soup.find("main")
    if root is None:
        root = soup.find("body")
    if root is None:
        root = soup

    # Apply the user's tag selection on every path, not only for <main>.
    return md(root, strip=check)
# Explanatory HTML shown in the interface (English first, then Japanese).
# NOTE(review): this text says Playwright is used, but main_fn in this file
# fetches pages with requests.get -- confirm which one is intended.
_DESCRIPTION = """<div style="width: fit-content; margin: 0 auto;">It gets the HTML given by the URL and converts it to Markdown. It uses Playwright, so it also supports dynamically generated HTML such as React.</div>
<div style="width: fit-content; margin: 0 auto;">URLで与えたHTMLを取得してMarkdownに変換します。Playwright を使用しているのでReactなどの動的に生成されるHTMLにも対応しています</div>"""

# Input components, in the order main_fn receives them: (url, check).
_url_box = gr.Text(label="URL", placeholder="https://*****")
_ignored_tags = gr.CheckboxGroup(
    label="Ignore tags(無視するタグ)",
    choices=["a", "img", "noscript"],
    value=["a", "img"],
)

# Single output area holding the generated Markdown.
_markdown_area = gr.TextArea(label="Markdown", show_copy_button=True)

# Build the web UI around main_fn and start serving it.
demo = gr.Interface(
    fn=main_fn,
    inputs=[_url_box, _ignored_tags],
    outputs=[_markdown_area],
    title="URL to Markdown V2",
    description=_DESCRIPTION,
    allow_flagging="never",
)
demo.launch()
|