Spaces:
initial commit
Browse files
google_search.py  +23 -19
google_search.py CHANGED
@@ -16,6 +16,7 @@ CSE_KEY = os.environ.get("GOOGLE_SEARCH_CSE_ID")
 # Number of pages to scrape
 NUM_PAGES = 20
 
+
 def build_results_beautifulsoup(url_list):
     print("Starting to scrape URLs...")
     start_time = time.perf_counter()
@@ -37,7 +38,7 @@ def build_results_beautifulsoup(url_list):
 
     if soup:
         print(f"Processing URL: {url}")
-
+
         text = extract(
             soup,
             include_tables=False,
@@ -143,8 +144,8 @@ async def extract_pdf_text(content):
         </body>
         </html>
         """
-        html_bytes = html_content.encode(
-        return html_bytes
+        html_bytes = html_content.encode("utf-8")
+        return html_bytes  # Return in such a format that is parsable by trafilatura
     except Exception as e:
         print(f"Error extracting PDF text: {str(e)}")
         return None
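The hunk above wraps the extracted PDF text in a minimal HTML document and returns it as UTF-8 bytes, so the downstream trafilatura extract call can treat a PDF the same way as a scraped web page. A rough sketch of that round trip is below; the wrap_pdf_text_as_html helper and the pdf_text argument are illustrative assumptions, not names from this commit, and the Space's own PDF parsing is not shown.

import trafilatura  # pip install trafilatura

def wrap_pdf_text_as_html(pdf_text: str) -> bytes:
    # Wrap plain PDF text in a minimal HTML shell and encode it,
    # so it can be fed to trafilatura like ordinary page content.
    html_content = f"""<html>
    <body>
    <p>{pdf_text}</p>
    </body>
    </html>
    """
    return html_content.encode("utf-8")

html_bytes = wrap_pdf_text_as_html("Example text pulled from a PDF.")
# Decoded back to a string here before extraction for safety.
text = trafilatura.extract(html_bytes.decode("utf-8"), include_tables=False)
print(text)  # may be None for very short documents; real PDF text is longer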
@@ -183,39 +184,41 @@ def google_search_urls(
 ):
     if skip_urls is None:
         skip_urls = []  # Initialize as empty list if not provided
-
+
     service = build("customsearch", "v1", developerKey=api_key)
     url_list = []
     start_index = 1  # Initial index for the search results
     while len(url_list) < total_results:
         # Fetch a page of results
-        results =
-
-
-
-
-
-
-
-
+        results = (
+            service.cse()
+            .list(
+                q=text,
+                cx=cse_id,
+                sort=sorted_date,
+                start=start_index,
+                num=min(num_results, total_results - len(url_list)),
+                **kwargs,
+            )
+            .execute()
+        )
+
         if "items" in results and len(results["items"]) > 0:
             for count, link in enumerate(results["items"]):
                 url = link["link"]
                 # Skip if the URL is in the skip_urls list or doesn't match the domain filter
                 if url in skip_urls:
                     continue
-                if (domains_to_include is None) or any(
-                    ("." + domain) in url for domain in domains_to_include
-                ):
+                if (domains_to_include is None) or any(("." + domain) in url for domain in domains_to_include):
                     if url not in url_list:
                         url_list.append(url)
         else:
             # No more results
             break
-
+
         # Move to the next page of results
         start_index += num_results
-
+
     return url_list[:total_results]
 
 
@@ -240,7 +243,8 @@ def google_search(topic, sorted_date, domains_to_include, scholar_mode_check):
     result_content = build_results_beautifulsoup(url_list)
     return result_content
 
+
 if __name__ == "__main__":
     res = google_search("Low Resource ", "date:r:20240101:20241231", domain_list, False)
     print(res.keys())
-    print(len(res))
+    print(len(res))
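The largest change in this commit, in the hunk for google_search_urls, switches to a paginated fetch against the Google Custom Search JSON API via google-api-python-client, requesting up to num results per call and advancing start_index until total_results links are collected. The following is a minimal, self-contained sketch of that pattern; the fetch_urls wrapper, the GOOGLE_SEARCH_API_KEY variable name, the example query, and the trafilatura.fetch_url download step are assumptions made for the sketch, not part of the commit (the Space downloads pages in build_results_beautifulsoup and passes them to trafilatura's extract).

import os

import trafilatura                             # pip install trafilatura
from googleapiclient.discovery import build   # pip install google-api-python-client

API_KEY = os.environ["GOOGLE_SEARCH_API_KEY"]  # assumed name for the API key variable
CSE_ID = os.environ["GOOGLE_SEARCH_CSE_ID"]    # same variable the Space reads


def fetch_urls(query, total_results=20, num_results=10, sorted_date=None):
    """Collect up to total_results links by paging through the Custom Search API."""
    service = build("customsearch", "v1", developerKey=API_KEY)
    url_list = []
    start_index = 1                            # CSE result indices are 1-based
    while len(url_list) < total_results:
        params = dict(
            q=query,
            cx=CSE_ID,
            start=start_index,
            num=min(num_results, total_results - len(url_list)),  # CSE caps num at 10
        )
        if sorted_date:
            params["sort"] = sorted_date       # e.g. "date:r:20240101:20241231"
        results = service.cse().list(**params).execute()

        items = results.get("items", [])
        if not items:
            break                              # no more results
        for item in items:
            url = item["link"]
            if url not in url_list:
                url_list.append(url)
        start_index += num_results             # move to the next page of results
    return url_list[:total_results]


if __name__ == "__main__":
    for url in fetch_urls("low resource languages", total_results=5,
                          sorted_date="date:r:20240101:20241231"):
        downloaded = trafilatura.fetch_url(url)   # returns None on failure
        text = trafilatura.extract(downloaded, include_tables=False) if downloaded else None
        print(url, (text or "")[:80])

The committed version additionally filters each link against skip_urls and domains_to_include before appending it, as shown in the hunk above.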