# url-to-markdown / app.py
# moritalous's picture
# Update app.py
# 5e0955d verified
import gradio as gr
import requests
from bs4 import BeautifulSoup
from markdownify import MarkdownConverter
def md(soup, **options):
    """Convert an already-parsed BeautifulSoup node to Markdown.

    Args:
        soup: The parsed node handed to ``MarkdownConverter.convert_soup``.
        **options: Keyword options forwarded unchanged to the
            ``MarkdownConverter`` constructor.

    Returns:
        Whatever ``convert_soup`` returns for the given node.
    """
    converter = MarkdownConverter(**options)
    return converter.convert_soup(soup)
def main_fn(url: str, check: list[str]):
    """Download the page at *url* and return its content as Markdown.

    ``<script>`` and ``<style>`` elements are removed before conversion. The
    ``<main>`` element is converted when the page has one; otherwise the
    ``<body>`` element is used, falling back to the whole parsed document
    when neither exists.

    Args:
        url: Address of the page to download. Surrounding whitespace is
            ignored.
        check: Tag names passed to the converter's ``strip`` option (the
            "Ignore tags" selection in the UI).

    Returns:
        The Markdown produced by ``md`` for the selected part of the page.

    Raises:
        requests.RequestException: If the download fails or times out.
    """
    # A timeout keeps an unresponsive host from blocking the worker forever.
    response = requests.get(url.strip(), timeout=30)

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(response.text, "html.parser")

    # decompose() removes each element together with its contents, so
    # script/style text never reaches the Markdown output.
    for element in soup.find_all(["script", "style"]):
        element.decompose()

    # Prefer <main>, then <body>; fall back to the whole document so a
    # fragment without either element is still converted.
    root = soup.find("main") or soup.find("body") or soup

    # Apply the ignore-tag selection no matter which root was chosen.
    return md(root, strip=check)
# HTML blurb shown under the title: English and Japanese usage notes plus a
# link to the newer version of this Space.
_description = """<div style="width: fit-content; margin: 0 auto;">Gets HTML given by URL and converts it to Markdown.Does not support dynamically generated HTML such as React.</div>
<div style="width: fit-content; margin: 0 auto;">URLで与えたHTMLを取得してMarkdownに変換します。Reactなどの動的に生成されるHTMLには対応していません</div>
<div style="width: fit-content; margin: 0 auto;"><a href="https://huggingface.co/spaces/moritalous/url-to-markdown-v2">New Version is here.</a></div>"""

# Input widgets, in the order main_fn receives their values: (url, check).
_url_box = gr.Text(label="URL", placeholder="https://*****")
_ignore_tags = gr.CheckboxGroup(
    label="Ignore tags(無視するタグ)",
    choices=["a", "img", "noscript"],
    value=["a", "img"],
)

# Output widget that displays the converted Markdown.
_markdown_out = gr.TextArea(label="Markdown", show_copy_button=True)

# Wire the widgets to main_fn.
demo = gr.Interface(
    fn=main_fn,
    inputs=[_url_box, _ignore_tags],
    outputs=[_markdown_out],
    title="URL to Markdown",
    description=_description,
    allow_flagging="never",
)

# Start the Gradio app.
demo.launch()