Spaces:

BenjaminB
/

gistillery

Runtime error

App Files Files Community

Benjamin Bossan committed on May 9, 2023

Commit

64d4f97

•

1 Parent(s): c19ef6e

Add actual code for processing web pages

Browse files

This uses trafilatura to extract the main text from a web page.

Files changed (7) hide show

pyproject.toml +2 -1
requests.org +19 -5
requirements.txt +2 -0
src/gistillery/ml.py +8 -5
src/gistillery/preprocessing.py +6 -3
src/gistillery/registry.py +5 -2
tests/test_app.py +62 -0

pyproject.toml CHANGED Viewed

@@ -17,5 +17,6 @@ addopts = "--cov=src --cov-report=term-missing"
 no_implicit_optional = true
 strict = true
-[[tool.mypy-transformers]]
 ignore_missing_imports = true

 no_implicit_optional = true
 strict = true
+[[tool.mypy.overrides]]
+module = "transformers,trafilatura"
 ignore_missing_imports = true

requests.org CHANGED Viewed

@@ -22,7 +22,7 @@ curl -X 'POST' \
 #+end_src
 #+RESULTS:
-: Submitted job 6012b198ffe0467d9344a196a2ced121
 #+begin_src bash
 curl -X 'POST' \
@@ -36,16 +36,30 @@ curl -X 'POST' \
 #+end_src
 #+RESULTS:
-: Submitted job 05058b906f524fb4bfedc4f5a84eff06
 #+begin_src bash
 curl -X 'GET' \
-  'http://localhost:8080/check_job_status/6012b198ffe0467d9344a196a2ced121' \
   -H 'accept: application/json'
 #+end_src
 #+RESULTS:
-| {"id":"6012b198ffe0467d9344a196a2ced121" | status:"done" | last_updated:"2023-05-08T12:27:07"} |
 #+begin_src bash
 curl -X 'GET' \
@@ -54,4 +68,4 @@ curl -X 'GET' \
 #+end_src
 #+RESULTS:
-| [{"id":"05058b906f524fb4bfedc4f5a84eff06" | author:"ben" | summary:"A new approach to NLP that incorporates reinforcement learning and human feedback. How does it work? Why does it work? In this post | I’ll explain how it works. RLHF is a new approach to NLP that incorporates reinforcement learning and human feedback. It’s a new approach to NLP that incorporates reinforcement learning and human feedback. It’s a new approach to NLP that incorporates reinforcement learning and human feedback. It’s a new approach to NLP that incorporates reinforcement learning and human feedback. It’s a new approach to NLP that incorporates reinforcement learning and human feedback." | tags:["#general" | #rlhf] | date:"2023-05-08T12:27:31"} | {"id":"6012b198ffe0467d9344a196a2ced121" | author:"ben" | summary:"GitLab | the most comprehensive | scalable enterprise DevSecOps platform for software innovation | and Google Cloud today announced an extension of their strategic partnership to deliver secure AI offerings to the enterprise. By leveraging Google Cloud's customizable foundation models and open generative AI infrastructure | GitLab will provide customers with AI-assisted features directly within the enterprise DevSecOps platform. The company's AI capabilities are designed to help enterprises improve productivity and reduce costs." | tags:["#general"] | date:"2023-05-08T12:27:07"}] |

 #+end_src
 #+RESULTS:
+: Submitted job 04deee1a2a9b4d6ea986ffe0fa4017d9
 #+begin_src bash
 curl -X 'POST' \
 #+end_src
 #+RESULTS:
+: Submitted job 730352e00e8145b39971fdc386c28a8f
+#+begin_src bash
+curl -X 'POST' \
+  'http://localhost:8080/submit/' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "author": "ben",
+  "content": "https://en.wikipedia.org/wiki/Goulburn_Street"
+}'
+#+end_src
+#+RESULTS:
+: Submitted job 1738d7daa96147198d80b93ea040863d
 #+begin_src bash
 curl -X 'GET' \
+  'http://localhost:8080/check_job_status/1738d7daa96147198d80b93ea040863d' \
   -H 'accept: application/json'
 #+end_src
 #+RESULTS:
+| {"id":"1738d7daa96147198d80b93ea040863d" | status:"pending" | last_updated:"2023-05-09T13:24:42"} |
 #+begin_src bash
 curl -X 'GET' \
 #+end_src
 #+RESULTS:
+| [{"id":"1738d7daa96147198d80b93ea040863d" | author:"ben" | summary:"Goulburn Street is a street in the central business district of Sydney | New South Wales | Australia. It runs from Darling Harbour and Chinatown in the west to Crown Street in the east at Darlinghurst and Surry Hills. The only car park operated by Sydney City Council within the CBD is at the corner of Goulburn and Elizabeth Streets. It was the first air rights car park in Australia | opening in 1963 over six tracks of the City Circle line.[3][4]" | tags:["#centralbusinessdistrict" | #darlinghurst | #general | #goulburnstreet | #surryhills | #sydney | #sydneymasoniccentre] | date:"2023-05-09T13:24:42"} | {"id":"730352e00e8145b39971fdc386c28a8f" | author:"ben" | summary:"A new approach to NLP that incorporates reinforcement learning and human feedback. How does it work? Why does it work? In this post | I’ll explain how it works. RLHF is a new approach to NLP that incorporates reinforcement learning and human feedback. It’s a new approach to NLP that incorporates reinforcement learning and human feedback. It’s a new approach to NLP that incorporates reinforcement learning and human feedback. It’s a new approach to NLP that incorporates reinforcement learning and human feedback. It’s a new approach to NLP that incorporates reinforcement learning and human feedback." | tags:["#" | #general | #rlhf] | date:"2023-05-09T13:24:38"} | {"id":"04deee1a2a9b4d6ea986ffe0fa4017d9" | author:"ben" | summary:"GitLab | the most comprehensive | scalable enterprise DevSecOps platform for software innovation | and Google Cloud today announced an extension of their strategic partnership to deliver secure AI offerings to the enterprise. By leveraging Google Cloud's customizable foundation models and open generative AI infrastructure | GitLab will provide customers with AI-assisted features directly within the enterprise DevSecOps platform. 
The company's AI capabilities are designed to help enterprises improve productivity and reduce costs." | tags:["#ai-assistedfeatures" | #enterprisedevsecopsplatform | #general | #gitlab | #googlecloud] | date:"2023-05-09T13:24:36"}] |

requirements.txt CHANGED Viewed

@@ -3,3 +3,5 @@ httpx
 uvicorn[standard]
 torch
 transformers

 uvicorn[standard]
 torch
 transformers
+charset-normalizer
+trafilatura

src/gistillery/ml.py CHANGED Viewed

@@ -70,14 +70,17 @@ class HfTransformersTagger(Tagger):
         self.template = (
             "Create a list of tags for the text below. The tags should be high level "
-            "and specific. Prefix each tag with a hashtag.\n\n{}\n\nTags: #general"
         )
     def _extract_tags(self, text: str) -> list[str]:
-        tags = set()
-        for tag in text.split():
-            if tag.startswith("#"):
-                tags.add(tag.lower())
         return sorted(tags)
     def __call__(self, x: str) -> list[str]:

         self.template = (
             "Create a list of tags for the text below. The tags should be high level "
+            "and specific. Return the results as a comma separated list.\n\n"
+            "{}\n\nTags:\n"
         )
     def _extract_tags(self, text: str) -> list[str]:
+        tags = {"#general"}
+        for tag in text.split(","):
+            tag = tag.strip().lower().replace(" ", "")
+            if not tag.startswith("#"):
+                tag = "#" + tag
+            tags.add(tag)
         return sorted(tags)
     def __call__(self, x: str) -> list[str]:

src/gistillery/preprocessing.py CHANGED Viewed

@@ -2,7 +2,8 @@ import abc
 import logging
 import re
-import httpx
 from gistillery.base import JobInput
@@ -39,8 +40,9 @@ class RawTextProcessor(Processor):
 class DefaultUrlProcessor(Processor):
     def __init__(self) -> None:
-        self.client = httpx.Client()
         self.regex = re.compile(r"(https?://[^\s]+)")
         self.url = None
         self.template = "{url}\n\n{content}"
@@ -57,5 +59,6 @@ class DefaultUrlProcessor(Processor):
         assert isinstance(self.url, str)
         text = self.client.get(self.url).text
         assert isinstance(text, str)
-        text = self.template.format(url=self.url, content=text)
         return text

 import logging
 import re
+from httpx import Client
+from trafilatura import extract
 from gistillery.base import JobInput
 class DefaultUrlProcessor(Processor):
+    # uses trafilatura to extract text from html
     def __init__(self) -> None:
+        self.client = Client()
         self.regex = re.compile(r"(https?://[^\s]+)")
         self.url = None
         self.template = "{url}\n\n{content}"
         assert isinstance(self.url, str)
         text = self.client.get(self.url).text
         assert isinstance(text, str)
+        extracted = extract(text)
+        text = self.template.format(url=self.url, content=extracted)
         return text

src/gistillery/registry.py CHANGED Viewed

@@ -12,8 +12,11 @@ class MlRegistry:
         self.model = None
         self.tokenizer = None
-    def register_processor(self, processor: Processor) -> None:
-        self.processors.append(processor)
     def register_summarizer(self, summarizer: Summarizer) -> None:
         self.summerizer = summarizer

         self.model = None
         self.tokenizer = None
+    def register_processor(self, processor: Processor, last: bool = True) -> None:
+        if last:
+            self.processors.append(processor)
+        else:
+            self.processors.insert(0, processor)
     def register_summarizer(self, summarizer: Summarizer) -> None:
         self.summerizer = summarizer

tests/test_app.py CHANGED Viewed

@@ -234,3 +234,65 @@ class TestWebservice:
         rows = cursor.execute("SELECT * FROM inputs").fetchall()
         assert len(rows) == 1
         assert rows[0].input == "this is a test"

         rows = cursor.execute("SELECT * FROM inputs").fetchall()
         assert len(rows) == 1
         assert rows[0].input == "this is a test"
+    def test_submit_url(self, client, cursor, mlregistry, monkeypatch):
+        class MockClient:
+            """Mock httpx Client, return www.example.com content"""
+            def get(self, url):
+                return SimpleNamespace(
+                    text=''' <!doctype html>\n<html>\n<head>\n <title>Example
+                    Domain</title>\n\n <meta charset="utf-8" />\n <meta
+                    http-equiv="Content-type" content="text/html; charset=utf-8"
+                    />\n <meta name="viewport" content="width=device-width,
+                    initial-scale=1" />\n <style type="text/css">\n body {\n
+                    background-color: #f0f0f2;\n margin: 0;\n padding: 0;\n
+                    font-family: -apple-system, system-ui, BlinkMacSystemFont,
+                    "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial,
+                    sans-serif;\n \n }\n div {\n width: 600px;\n margin: 5em
+                    auto;\n padding: 2em;\n background-color: #fdfdff;\n
+                    border-radius: 0.5em;\n box-shadow: 2px 3px 7px 2px
+                    rgba(0,0,0,0.02);\n }\n a:link, a:visited {\n color:
+                    #38488f;\n text-decoration: none;\n }\n @media (max-width:
+                    700px) {\n div {\n margin: 0 auto;\n width: auto;\n }\n }\n
+                    </style> \n</head>\n\n<body>\n<div>\n <h1>Example
+                    Domain</h1>\n <p>This domain is for use in illustrative
+                    examples in documents. You may use this\n domain in
+                    literature without prior coordination or asking for
+                    permission.</p>\n <p><a
+                    href="https://www.iana.org/domains/example">More
+                    information...</a></p>\n</div>\n</body>\n</html>\n'''
+                )
+        monkeypatch.setattr("gistillery.preprocessing.Client", MockClient)
+        from gistillery.preprocessing import DefaultUrlProcessor
+        # register url processor, put it before the default processor
+        mlregistry.register_processor(DefaultUrlProcessor(), last=False)
+        client.post(
+            "/submit",
+            json={
+                "author": "ben",
+                "content": "https://en.wikipedia.org/wiki/non-existing-page",
+            },
+        )
+        self.process_jobs(mlregistry)
+        rows = cursor.execute("SELECT * FROM inputs").fetchall()
+        assert len(rows) == 1
+        expected = "\n".join(
+            [
+                'https://en.wikipedia.org/wiki/non-existing-page',
+                '',
+                'This domain is for use in illustrative',
+                'examples in documents. You may use this',
+                'domain in',
+                'literature without prior coordination or asking for',
+                'permission.',
+                'More',
+                'information...',
+            ]
+        )
+        assert rows[0].input == expected