import os
import json

from pydantic import BaseModel, Field

from crawl4ai.web_crawler import WebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy

url = r'https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot'

# Schema the LLM should fill in for the crawled page.
class PageSummary(BaseModel):
    title: str = Field(..., description="Title of the page.")
    summary: str = Field(..., description="Summary of the page.")
    brief_summary: str = Field(..., description="Brief summary of the page.")
    keywords: list = Field(..., description="Keywords assigned to the page.")

# Create the crawler and warm it up before the first run.
crawler = WebCrawler()
crawler.warmup()

result = crawler.run(
    url=url,
    word_count_threshold=1,
    extraction_strategy=LLMExtractionStrategy(
        provider="openai/gpt-4o",
        api_token=os.getenv('OPENAI_API_KEY'),
        schema=PageSummary.model_json_schema(),
        extraction_type="schema",
        apply_chunking=False,
        instruction=(
            "From the crawled content, extract the following details: "
            "1. Title of the page "
            "2. Summary of the page, which is a detailed summary "
            "3. Brief summary of the page, which is a paragraph text "
            "4. Keywords assigned to the page, which is a list of keywords. "
            "The extracted JSON format should look like this: "
            '{ "title": "Page Title", "summary": "Detailed summary of the page.", '
            '"brief_summary": "Brief summary in a paragraph.", '
            '"keywords": ["keyword1", "keyword2", "keyword3"] }'
        ),
    ),
    bypass_cache=True,
)

# Parse the extracted JSON, print it, and save the raw output to disk.
page_summary = json.loads(result.extracted_content)
print(page_summary)

os.makedirs(".data", exist_ok=True)
with open(".data/page_summary.json", "w", encoding="utf-8") as f:
    f.write(result.extracted_content)