moritalous committed on
Commit
1f9c201
1 Parent(s): 0c43f3c

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile +25 -0
  2. app.py +74 -0
  3. requirements.txt +5 -0
Dockerfile ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Playwright base image; the tag matches the playwright==1.47.0 pin in requirements.txt.
FROM mcr.microsoft.com/playwright:v1.47.0-noble

# Install Python and pip, then drop the apt package lists in the same layer
# so they do not stay in the final image.
RUN apt-get update -q \
    && apt-get install -qy python3 python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Switch to the "ubuntu" user
USER ubuntu

# Point HOME at the ubuntu user's home directory and put its local bin
# directory (where `pip install` places user-level scripts) on PATH
ENV HOME=/home/ubuntu \
    PATH=/home/ubuntu/.local/bin:$PATH

# Set the working directory to the app directory inside the user's home
WORKDIR $HOME/app

# Run pip after `USER ubuntu` so packages are installed for that user,
# avoiding permission issues with Python
COPY --chown=ubuntu requirements.txt $HOME/app/
RUN pip install --no-cache-dir --upgrade -r requirements.txt --break-system-packages

# Copy the application script into $HOME/app, owned by the ubuntu user
COPY --chown=ubuntu app.py $HOME/app/

EXPOSE 7860

# Equivalent to running `python3 app.py` from the working directory
ENTRYPOINT [ "python3" ]
CMD [ "app.py" ]
app.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from bs4 import BeautifulSoup
3
+ from markdownify import MarkdownConverter
4
+ from playwright.sync_api import sync_playwright
5
+
6
+
7
def md(soup, **options):
    """Render a BeautifulSoup node as Markdown.

    Keyword arguments are handed unchanged to ``MarkdownConverter``.
    """
    converter = MarkdownConverter(**options)
    markdown_text = converter.convert_soup(soup)
    return markdown_text
9
+
10
+
11
def main_fn(url: str, check: list[str], request: gr.Request) -> str:
    """Load ``url`` in headless Chromium and return the page as Markdown.

    Args:
        url: Address of the page to convert. Only absolute ``http`` and
            ``https`` URLs are accepted.
        check: Tag names to strip during conversion (the "Ignore tags"
            selection from the UI).
        request: Incoming Gradio request; its ``user-agent`` header is
            reused for the outgoing browser context.

    Returns:
        The page title, a ``======`` rule, and the Markdown rendering of the
        ``<main>`` element when present, otherwise of ``<body>``.

    Raises:
        gr.Error: If ``url`` is not an absolute http(s) URL.
    """
    from urllib.parse import urlparse

    # The URL is untrusted input handed to a server-side browser: refuse
    # anything that is not plain http(s) so schemes such as file:// cannot
    # be used to read local resources.
    url = (url or "").strip()
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https") or not parsed.netloc:
        raise gr.Error("Please enter a URL starting with http:// or https://")

    user_agent = request.headers["user-agent"]

    with sync_playwright() as p:
        browser = p.chromium.launch(
            args=[
                "--single-process",
                "--no-zygote",
                "--no-sandbox",
                "--disable-gpu",
                "--disable-dev-shm-usage",
                "--headless=new",
            ]
        )
        try:
            context = browser.new_context(user_agent=user_agent)
            page = context.new_page()

            # The navigation response is not needed; goto() may return None
            # for some navigations, so it is deliberately not dereferenced.
            page.goto(url=url)

            content = page.content()
            title = page.title()
        finally:
            # Close the browser even when navigation fails.
            browser.close()

    soup = BeautifulSoup(content, features="html.parser")

    # Remove scripts and stylesheets so their source never reaches the output.
    for tag in soup.find_all(["script", "style"]):
        tag.decompose()

    # Prefer <main>; fall back to <body>, then to the whole document.
    target = soup.find("main")
    if target is None:
        target = soup.find("body")
    if target is None:
        target = soup

    # Apply the user's ignore list whichever element ends up being converted.
    body = md(target, strip=check)

    return f"{title}\n======\n\n{body}"
55
+
56
+
57
# Input widgets: the page address and the tags to leave out of the output.
url_box = gr.Text(label="URL", placeholder="https://*****")
ignored_tags = gr.CheckboxGroup(
    choices=["a", "img", "noscript"],
    value=["a", "img"],
    label="Ignore tags(無視するタグ)",
)

# Output widget: the generated Markdown, with a copy button.
markdown_box = gr.TextArea(label="Markdown", show_copy_button=True)

demo = gr.Interface(
    fn=main_fn,
    inputs=[url_box, ignored_tags],
    outputs=[markdown_box],
    title="URL to Markdown",
    description="""<div style="width: fit-content; margin: 0 auto;">Gets HTML given by URL and converts it to Markdown.Does not support dynamically generated HTML such as React.</div>
<div style="width: fit-content; margin: 0 auto;">URLで与えたHTMLを取得してMarkdownに変換します。Reactなどの動的に生成されるHTMLには対応していません</div>""",
    allow_flagging="never",
)

# Bind to every network interface rather than only localhost.
demo.launch(server_name="0.0.0.0")
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ beautifulsoup4
2
+ gradio
3
+ markdownify
4
+ playwright==1.47.0
5
+ requests