Spaces:

kenken999
/

fastapi_django_main_live

Sleeping

App Files Files Community

kenken999 committed on Jun 16, 2024

Commit

25e1c5e

1 Parent(s): 8545387

gpt

Browse files

Files changed (2) hide show

README.md +1 -0
polls/crawl.py +71 -0

README.md CHANGED Viewed

@@ -16,3 +16,4 @@ export OPENAI_API_KEY="*************"
 export MODEL_NAME="llama3-8b-8192"
 export LOCAL_MODEL=true
 for interpreter

 export MODEL_NAME="llama3-8b-8192"
 export LOCAL_MODEL=true
 for interpreter
+https://zenn.dev/singularity/articles/231adc99e78a3d

polls/crawl.py ADDED Viewed

	@@ -0,0 +1,71 @@

+import requests
+from bs4 import BeautifulSoup
+import json
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+import time
+import re
+import os
+# URL of the your website
+url = 'https://XXX.com'
+# Set Chrome options to enable headless mode
+chrome_options = Options()
+chrome_options.add_argument('--headless')
+chrome_options.add_argument('--no-sandbox')
+chrome_options.add_argument('--disable-dev-shm-usage')
+# Set the path to chromedriver executable
+chromedriver_path = '/usr/local/bin/chromedriver'
+# Create a new Chrome instance
+driver = webdriver.Chrome(options=chrome_options)
+# Load the website
+driver.get(url)
+# Wait for the page to fully render
+time.sleep(5)
+# Extract the rendered HTML
+html = driver.page_source
+# Close the Chrome instance
+driver.quit()
+# Parse the HTML using BeautifulSoup
+soup = BeautifulSoup(html, 'html.parser')
+data = {}
+# h1〜h4のタグを取得
+headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5'])
+(h1_text,h2_text,h3_text,h4_text,h5_text)=("","","","","")
+for heading in headings:
+    if heading.name == 'h1':
+        h1_text = heading.text
+        key = h1_text
+    elif heading.name == 'h2':
+        h2_text = heading.text
+        key = f"{h1_text}-{h2_text}"
+    elif heading.name == 'h3':
+        h3_text = heading.text
+        key = f"{h1_text}-{h2_text}-{h3_text}"
+    elif heading.name == 'h4':
+        h4_text = heading.text
+        key = f"{h1_text}-{h2_text}-{h3_text}-{h4_text}"
+    elif heading.name == 'h5':
+        h5_text = heading.text
+        key = f"{h1_text}-{h2_text}-{h3_text}-{h5_text}"
+    # 次の要素のテキストを取得
+    sibling = heading.find_next_sibling()
+    value = ''
+    while sibling and not sibling.name in ['h1', 'h2', 'h3', 'h4', 'h5']:
+        value += sibling.text
+        sibling = sibling.find_next_sibling()
+    data[key] = value.strip()
+print(len(data),(data.keys()))