Spaces:
Runtime error
Runtime error
Benjamin Bossan
committed on
Commit
•
64d4f97
1
Parent(s):
c19ef6e
Add actual code for processing web pages
Browse files
This uses trafilatura to extract the main text from a web page.
- pyproject.toml +2 -1
- requests.org +19 -5
- requirements.txt +2 -0
- src/gistillery/ml.py +8 -5
- src/gistillery/preprocessing.py +6 -3
- src/gistillery/registry.py +5 -2
- tests/test_app.py +62 -0
pyproject.toml
CHANGED
@@ -17,5 +17,6 @@ addopts = "--cov=src --cov-report=term-missing"
|
|
17 |
no_implicit_optional = true
|
18 |
strict = true
|
19 |
|
20 |
-
[[tool.mypy
|
|
|
21 |
ignore_missing_imports = true
|
|
|
17 |
no_implicit_optional = true
|
18 |
strict = true
|
19 |
|
20 |
+
[[tool.mypy.overrides]]
|
21 |
+
module = "transformers,trafilatura"
|
22 |
ignore_missing_imports = true
|
requests.org
CHANGED
@@ -22,7 +22,7 @@ curl -X 'POST' \
|
|
22 |
#+end_src
|
23 |
|
24 |
#+RESULTS:
|
25 |
-
: Submitted job
|
26 |
|
27 |
#+begin_src bash
|
28 |
curl -X 'POST' \
|
@@ -36,16 +36,30 @@ curl -X 'POST' \
|
|
36 |
#+end_src
|
37 |
|
38 |
#+RESULTS:
|
39 |
-
: Submitted job
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
#+begin_src bash
|
42 |
curl -X 'GET' \
|
43 |
-
'http://localhost:8080/check_job_status/
|
44 |
-H 'accept: application/json'
|
45 |
#+end_src
|
46 |
|
47 |
#+RESULTS:
|
48 |
-
| {"id":"
|
49 |
|
50 |
#+begin_src bash
|
51 |
curl -X 'GET' \
|
@@ -54,4 +68,4 @@ curl -X 'GET' \
|
|
54 |
#+end_src
|
55 |
|
56 |
#+RESULTS:
|
57 |
-
| [{"id":"
|
|
|
22 |
#+end_src
|
23 |
|
24 |
#+RESULTS:
|
25 |
+
: Submitted job 04deee1a2a9b4d6ea986ffe0fa4017d9
|
26 |
|
27 |
#+begin_src bash
|
28 |
curl -X 'POST' \
|
|
|
36 |
#+end_src
|
37 |
|
38 |
#+RESULTS:
|
39 |
+
: Submitted job 730352e00e8145b39971fdc386c28a8f
|
40 |
+
|
41 |
+
#+begin_src bash
|
42 |
+
curl -X 'POST' \
|
43 |
+
'http://localhost:8080/submit/' \
|
44 |
+
-H 'accept: application/json' \
|
45 |
+
-H 'Content-Type: application/json' \
|
46 |
+
-d '{
|
47 |
+
"author": "ben",
|
48 |
+
"content": "https://en.wikipedia.org/wiki/Goulburn_Street"
|
49 |
+
}'
|
50 |
+
#+end_src
|
51 |
+
|
52 |
+
#+RESULTS:
|
53 |
+
: Submitted job 1738d7daa96147198d80b93ea040863d
|
54 |
|
55 |
#+begin_src bash
|
56 |
curl -X 'GET' \
|
57 |
+
'http://localhost:8080/check_job_status/1738d7daa96147198d80b93ea040863d' \
|
58 |
-H 'accept: application/json'
|
59 |
#+end_src
|
60 |
|
61 |
#+RESULTS:
|
62 |
+
| {"id":"1738d7daa96147198d80b93ea040863d" | status:"pending" | last_updated:"2023-05-09T13:24:42"} |
|
63 |
|
64 |
#+begin_src bash
|
65 |
curl -X 'GET' \
|
|
|
68 |
#+end_src
|
69 |
|
70 |
#+RESULTS:
|
71 |
+
| [{"id":"1738d7daa96147198d80b93ea040863d" | author:"ben" | summary:"Goulburn Street is a street in the central business district of Sydney | New South Wales | Australia. It runs from Darling Harbour and Chinatown in the west to Crown Street in the east at Darlinghurst and Surry Hills. The only car park operated by Sydney City Council within the CBD is at the corner of Goulburn and Elizabeth Streets. It was the first air rights car park in Australia | opening in 1963 over six tracks of the City Circle line.[3][4]" | tags:["#centralbusinessdistrict" | #darlinghurst | #general | #goulburnstreet | #surryhills | #sydney | #sydneymasoniccentre] | date:"2023-05-09T13:24:42"} | {"id":"730352e00e8145b39971fdc386c28a8f" | author:"ben" | summary:"A new approach to NLP that incorporates reinforcement learning and human feedback. How does it work? Why does it work? In this post | I’ll explain how it works. RLHF is a new approach to NLP that incorporates reinforcement learning and human feedback. It’s a new approach to NLP that incorporates reinforcement learning and human feedback. It’s a new approach to NLP that incorporates reinforcement learning and human feedback. It’s a new approach to NLP that incorporates reinforcement learning and human feedback. It’s a new approach to NLP that incorporates reinforcement learning and human feedback." | tags:["#" | #general | #rlhf] | date:"2023-05-09T13:24:38"} | {"id":"04deee1a2a9b4d6ea986ffe0fa4017d9" | author:"ben" | summary:"GitLab | the most comprehensive | scalable enterprise DevSecOps platform for software innovation | and Google Cloud today announced an extension of their strategic partnership to deliver secure AI offerings to the enterprise. By leveraging Google Cloud's customizable foundation models and open generative AI infrastructure | GitLab will provide customers with AI-assisted features directly within the enterprise DevSecOps platform. 
The company's AI capabilities are designed to help enterprises improve productivity and reduce costs." | tags:["#ai-assistedfeatures" | #enterprisedevsecopsplatform | #general | #gitlab | #googlecloud] | date:"2023-05-09T13:24:36"}] |
|
requirements.txt
CHANGED
@@ -3,3 +3,5 @@ httpx
|
|
3 |
uvicorn[standard]
|
4 |
torch
|
5 |
transformers
|
|
|
|
|
|
3 |
uvicorn[standard]
|
4 |
torch
|
5 |
transformers
|
6 |
+
charset-normalizer
|
7 |
+
trafilatura
|
src/gistillery/ml.py
CHANGED
@@ -70,14 +70,17 @@ class HfTransformersTagger(Tagger):
|
|
70 |
|
71 |
self.template = (
|
72 |
"Create a list of tags for the text below. The tags should be high level "
|
73 |
-
"and specific.
|
|
|
74 |
)
|
75 |
|
76 |
def _extract_tags(self, text: str) -> list[str]:
|
77 |
-
tags =
|
78 |
-
for tag in text.split():
|
79 |
-
|
80 |
-
|
|
|
|
|
81 |
return sorted(tags)
|
82 |
|
83 |
def __call__(self, x: str) -> list[str]:
|
|
|
70 |
|
71 |
self.template = (
|
72 |
"Create a list of tags for the text below. The tags should be high level "
|
73 |
+
"and specific. Return the results as a comma separated list.\n\n"
|
74 |
+
"{}\n\nTags:\n"
|
75 |
)
|
76 |
|
77 |
def _extract_tags(self, text: str) -> list[str]:
|
78 |
+
tags = {"#general"}
|
79 |
+
for tag in text.split(","):
|
80 |
+
tag = tag.strip().lower().replace(" ", "")
|
81 |
+
if not tag.startswith("#"):
|
82 |
+
tag = "#" + tag
|
83 |
+
tags.add(tag)
|
84 |
return sorted(tags)
|
85 |
|
86 |
def __call__(self, x: str) -> list[str]:
|
src/gistillery/preprocessing.py
CHANGED
@@ -2,7 +2,8 @@ import abc
|
|
2 |
import logging
|
3 |
import re
|
4 |
|
5 |
-
import
|
|
|
6 |
|
7 |
from gistillery.base import JobInput
|
8 |
|
@@ -39,8 +40,9 @@ class RawTextProcessor(Processor):
|
|
39 |
|
40 |
|
41 |
class DefaultUrlProcessor(Processor):
|
|
|
42 |
def __init__(self) -> None:
|
43 |
-
self.client =
|
44 |
self.regex = re.compile(r"(https?://[^\s]+)")
|
45 |
self.url = None
|
46 |
self.template = "{url}\n\n{content}"
|
@@ -57,5 +59,6 @@ class DefaultUrlProcessor(Processor):
|
|
57 |
assert isinstance(self.url, str)
|
58 |
text = self.client.get(self.url).text
|
59 |
assert isinstance(text, str)
|
60 |
-
|
|
|
61 |
return text
|
|
|
2 |
import logging
|
3 |
import re
|
4 |
|
5 |
+
from httpx import Client
|
6 |
+
from trafilatura import extract
|
7 |
|
8 |
from gistillery.base import JobInput
|
9 |
|
|
|
40 |
|
41 |
|
42 |
class DefaultUrlProcessor(Processor):
|
43 |
+
# uses trafilatura to extract text from html
|
44 |
def __init__(self) -> None:
|
45 |
+
self.client = Client()
|
46 |
self.regex = re.compile(r"(https?://[^\s]+)")
|
47 |
self.url = None
|
48 |
self.template = "{url}\n\n{content}"
|
|
|
59 |
assert isinstance(self.url, str)
|
60 |
text = self.client.get(self.url).text
|
61 |
assert isinstance(text, str)
|
62 |
+
extracted = extract(text)
|
63 |
+
text = self.template.format(url=self.url, content=extracted)
|
64 |
return text
|
src/gistillery/registry.py
CHANGED
@@ -12,8 +12,11 @@ class MlRegistry:
|
|
12 |
self.model = None
|
13 |
self.tokenizer = None
|
14 |
|
15 |
-
def register_processor(self, processor: Processor) -> None:
|
16 |
-
|
|
|
|
|
|
|
17 |
|
18 |
def register_summarizer(self, summarizer: Summarizer) -> None:
|
19 |
self.summerizer = summarizer
|
|
|
12 |
self.model = None
|
13 |
self.tokenizer = None
|
14 |
|
15 |
+
def register_processor(self, processor: Processor, last: bool = True) -> None:
|
16 |
+
if last:
|
17 |
+
self.processors.append(processor)
|
18 |
+
else:
|
19 |
+
self.processors.insert(0, processor)
|
20 |
|
21 |
def register_summarizer(self, summarizer: Summarizer) -> None:
|
22 |
self.summerizer = summarizer
|
tests/test_app.py
CHANGED
@@ -234,3 +234,65 @@ class TestWebservice:
|
|
234 |
rows = cursor.execute("SELECT * FROM inputs").fetchall()
|
235 |
assert len(rows) == 1
|
236 |
assert rows[0].input == "this is a test"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
234 |
rows = cursor.execute("SELECT * FROM inputs").fetchall()
|
235 |
assert len(rows) == 1
|
236 |
assert rows[0].input == "this is a test"
|
237 |
+
|
238 |
+
def test_submit_url(self, client, cursor, mlregistry, monkeypatch):
|
239 |
+
class MockClient:
|
240 |
+
"""Mock httpx Client, return www.example.com content"""
|
241 |
+
|
242 |
+
def get(self, url):
|
243 |
+
return SimpleNamespace(
|
244 |
+
text=''' <!doctype html>\n<html>\n<head>\n <title>Example
|
245 |
+
Domain</title>\n\n <meta charset="utf-8" />\n <meta
|
246 |
+
http-equiv="Content-type" content="text/html; charset=utf-8"
|
247 |
+
/>\n <meta name="viewport" content="width=device-width,
|
248 |
+
initial-scale=1" />\n <style type="text/css">\n body {\n
|
249 |
+
background-color: #f0f0f2;\n margin: 0;\n padding: 0;\n
|
250 |
+
font-family: -apple-system, system-ui, BlinkMacSystemFont,
|
251 |
+
"Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial,
|
252 |
+
sans-serif;\n \n }\n div {\n width: 600px;\n margin: 5em
|
253 |
+
auto;\n padding: 2em;\n background-color: #fdfdff;\n
|
254 |
+
border-radius: 0.5em;\n box-shadow: 2px 3px 7px 2px
|
255 |
+
rgba(0,0,0,0.02);\n }\n a:link, a:visited {\n color:
|
256 |
+
#38488f;\n text-decoration: none;\n }\n @media (max-width:
|
257 |
+
700px) {\n div {\n margin: 0 auto;\n width: auto;\n }\n }\n
|
258 |
+
</style> \n</head>\n\n<body>\n<div>\n <h1>Example
|
259 |
+
Domain</h1>\n <p>This domain is for use in illustrative
|
260 |
+
examples in documents. You may use this\n domain in
|
261 |
+
literature without prior coordination or asking for
|
262 |
+
permission.</p>\n <p><a
|
263 |
+
href="https://www.iana.org/domains/example">More
|
264 |
+
information...</a></p>\n</div>\n</body>\n</html>\n'''
|
265 |
+
)
|
266 |
+
|
267 |
+
monkeypatch.setattr("gistillery.preprocessing.Client", MockClient)
|
268 |
+
|
269 |
+
from gistillery.preprocessing import DefaultUrlProcessor
|
270 |
+
|
271 |
+
# register url processor, put it before the default processor
|
272 |
+
mlregistry.register_processor(DefaultUrlProcessor(), last=False)
|
273 |
+
client.post(
|
274 |
+
"/submit",
|
275 |
+
json={
|
276 |
+
"author": "ben",
|
277 |
+
"content": "https://en.wikipedia.org/wiki/non-existing-page",
|
278 |
+
},
|
279 |
+
)
|
280 |
+
self.process_jobs(mlregistry)
|
281 |
+
|
282 |
+
rows = cursor.execute("SELECT * FROM inputs").fetchall()
|
283 |
+
assert len(rows) == 1
|
284 |
+
|
285 |
+
expected = "\n".join(
|
286 |
+
[
|
287 |
+
'https://en.wikipedia.org/wiki/non-existing-page',
|
288 |
+
'',
|
289 |
+
'This domain is for use in illustrative',
|
290 |
+
'examples in documents. You may use this',
|
291 |
+
'domain in',
|
292 |
+
'literature without prior coordination or asking for',
|
293 |
+
'permission.',
|
294 |
+
'More',
|
295 |
+
'information...',
|
296 |
+
]
|
297 |
+
)
|
298 |
+
assert rows[0].input == expected
|