diff --git a/.env.txt b/.env.txt new file mode 100644 index 0000000000000000000000000000000000000000..07d266c80fe69d6f172d8000e63ac066b8f3932e --- /dev/null +++ b/.env.txt @@ -0,0 +1,4 @@ +GROQ_API_KEY = "YOUR_GROQ_API" +OPENAI_API_KEY = "YOUR_OPENAI_API" +ANTHROPIC_API_KEY = "YOUR_ANTHROPIC_API" +# You can add more API keys here \ No newline at end of file diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..144fe136ea6389f5062bfba403ba11fc8d82a218 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,35 +1,12 @@ -*.7z filter=lfs diff=lfs merge=lfs -text -*.arrow filter=lfs diff=lfs merge=lfs -text -*.bin filter=lfs diff=lfs merge=lfs -text -*.bz2 filter=lfs diff=lfs merge=lfs -text -*.ckpt filter=lfs diff=lfs merge=lfs -text -*.ftz filter=lfs diff=lfs merge=lfs -text -*.gz filter=lfs diff=lfs merge=lfs -text -*.h5 filter=lfs diff=lfs merge=lfs -text -*.joblib filter=lfs diff=lfs merge=lfs -text -*.lfs.* filter=lfs diff=lfs merge=lfs -text -*.mlmodel filter=lfs diff=lfs merge=lfs -text -*.model filter=lfs diff=lfs merge=lfs -text -*.msgpack filter=lfs diff=lfs merge=lfs -text -*.npy filter=lfs diff=lfs merge=lfs -text -*.npz filter=lfs diff=lfs merge=lfs -text -*.onnx filter=lfs diff=lfs merge=lfs -text -*.ot filter=lfs diff=lfs merge=lfs -text -*.parquet filter=lfs diff=lfs merge=lfs -text -*.pb filter=lfs diff=lfs merge=lfs -text -*.pickle filter=lfs diff=lfs merge=lfs -text -*.pkl filter=lfs diff=lfs merge=lfs -text -*.pt filter=lfs diff=lfs merge=lfs -text -*.pth filter=lfs diff=lfs merge=lfs -text -*.rar filter=lfs diff=lfs merge=lfs -text -*.safetensors filter=lfs diff=lfs merge=lfs -text -saved_model/**/* filter=lfs diff=lfs merge=lfs -text -*.tar.* filter=lfs diff=lfs merge=lfs -text -*.tar filter=lfs diff=lfs merge=lfs -text -*.tflite filter=lfs diff=lfs merge=lfs -text -*.tgz filter=lfs diff=lfs merge=lfs -text -*.wasm filter=lfs diff=lfs merge=lfs -text -*.xz filter=lfs diff=lfs merge=lfs -text -*.zip filter=lfs diff=lfs merge=lfs -text -*.zst filter=lfs diff=lfs merge=lfs -text -*tfevents* filter=lfs diff=lfs merge=lfs -text +# Documentation +*.html linguist-documentation +docs/* linguist-documentation +docs/examples/* linguist-documentation +docs/md_v2/* linguist-documentation + +# Explicitly mark Python as the main language +*.py linguist-detectable=true +*.py linguist-language=Python + +# Exclude HTML from language statistics +*.html linguist-detectable=false \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..b28b377ae9199fa559f40d986e0c4439f9a886ce --- /dev/null +++ b/.gitignore @@ -0,0 +1,232 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ + +Crawl4AI.egg-info/ +Crawl4AI.egg-info/* +crawler_data.db +.vscode/ +.tests/ +.test_pads/ +test_pad.py +test_pad*.py +.data/ +Crawl4AI.egg-info/ + +requirements0.txt +a.txt + +*.sh +.idea +docs/examples/.chainlit/ +docs/examples/.chainlit/* +.chainlit/config.toml +.chainlit/translations/en-US.json + +local/ +.files/ + +a.txt +.lambda_function.py +ec2* + +update_changelog.sh + +.DS_Store +docs/.DS_Store +tmp/ +test_env/ +**/.DS_Store +**/.DS_Store + +todo.md +todo_executor.md +git_changes.py +git_changes.md +pypi_build.sh +git_issues.py +git_issues.md + +.next/ +.tests/ +# .issues/ +.docs/ +.issues/ +.gitboss/ +todo_executor.md +protect-all-except-feature.sh +manage-collab.sh +publish.sh +combine.sh +combined_output.txt +.local +.scripts +tree.md +tree.md +.scripts +.local +.do +/plans +plans/ + +# Codeium +.codeiumignore \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000000000000000000000000000000000000..afa841c9f79d6bd04153c182999270d38e219c79 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,1089 @@ +# Changelog + +All notable changes to Crawl4AI will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +--- + +## [0.4.267] - 2025 - 01 - 06 + +### Added +- **Windows Event Loop Configuration**: Introduced a utility function `configure_windows_event_loop` to resolve `NotImplementedError` for asyncio subprocesses on Windows. ([#utils.py](crawl4ai/utils.py), [#tutorials/async-webcrawler-basics.md](docs/md_v3/tutorials/async-webcrawler-basics.md)) +- **`page_need_scroll` Method**: Added a method to determine if a page requires scrolling before taking actions in `AsyncPlaywrightCrawlerStrategy`. ([#async_crawler_strategy.py](crawl4ai/async_crawler_strategy.py)) + +### Changed +- **Version Bump**: Updated the version from `0.4.246` to `0.4.247`. ([#__version__.py](crawl4ai/__version__.py)) +- **Improved Scrolling Logic**: Enhanced scrolling methods in `AsyncPlaywrightCrawlerStrategy` by adding a `scroll_delay` parameter for better control. ([#async_crawler_strategy.py](crawl4ai/async_crawler_strategy.py)) +- **Markdown Generation Example**: Updated the `hello_world.py` example to reflect the latest API changes and better illustrate features. ([#examples/hello_world.py](docs/examples/hello_world.py)) +- **Documentation Update**: + - Added Windows-specific instructions for handling asyncio event loops. ([#async-webcrawler-basics.md](docs/md_v3/tutorials/async-webcrawler-basics.md)) + +### Removed +- **Legacy Markdown Generation Code**: Removed outdated and unused code for markdown generation in `content_scraping_strategy.py`. ([#content_scraping_strategy.py](crawl4ai/content_scraping_strategy.py)) + +### Fixed +- **Page Closing to Prevent Memory Leaks**: + - **Description**: Added a `finally` block to ensure pages are closed when no `session_id` is provided. + - **Impact**: Prevents memory leaks caused by lingering pages after a crawl. + - **File**: [`async_crawler_strategy.py`](crawl4ai/async_crawler_strategy.py) + - **Code**: + ```python + finally: + # If no session_id is given we should close the page + if not config.session_id: + await page.close() + ``` +- **Multiple Element Selection**: Modified `_get_elements` in `JsonCssExtractionStrategy` to return all matching elements instead of just the first one, ensuring comprehensive extraction. 
([#extraction_strategy.py](crawl4ai/extraction_strategy.py)) +- **Error Handling in Scrolling**: Added robust error handling to ensure scrolling proceeds safely even if a configuration is missing. ([#async_crawler_strategy.py](crawl4ai/async_crawler_strategy.py)) + +### Other +- **Git Ignore Update**: Added `/plans` to `.gitignore` for better development environment consistency. ([#.gitignore](.gitignore)) + + +## [0.4.24] - 2024-12-31 + +### Added +- **Browser and SSL Handling** + - SSL certificate validation options in extraction strategies + - Custom certificate paths support + - Configurable certificate validation skipping + - Enhanced response status code handling with retry logic + +- **Content Processing** + - New content filtering system with regex support + - Advanced chunking strategies for large content + - Memory-efficient parallel processing + - Configurable chunk size optimization + +- **JSON Extraction** + - Complex JSONPath expression support + - JSON-CSS and Microdata extraction + - RDFa parsing capabilities + - Advanced data transformation pipeline + +- **Field Types** + - New field types: `computed`, `conditional`, `aggregate`, `template` + - Field inheritance system + - Reusable field definitions + - Custom validation rules + +### Changed +- **Performance** + - Optimized selector compilation with caching + - Improved HTML parsing efficiency + - Enhanced memory management for large documents + - Batch processing optimizations + +- **Error Handling** + - More detailed error messages and categorization + - Enhanced debugging capabilities + - Improved performance metrics tracking + - Better error recovery mechanisms + +### Deprecated +- Old field computation method using `eval` +- Direct browser manipulation without proper SSL handling +- Simple text-based content filtering + +### Removed +- Legacy extraction patterns without proper error handling +- Unsafe eval-based field computation +- Direct DOM manipulation without sanitization + +### Fixed +- Memory leaks in large document processing +- SSL certificate validation issues +- Incorrect handling of nested JSON structures +- Performance bottlenecks in parallel processing + +### Security +- Improved input validation and sanitization +- Safe expression evaluation system +- Enhanced resource protection +- Rate limiting implementation + +## [0.4.1] - 2024-12-08 + +### **File: `crawl4ai/async_crawler_strategy.py`** + +#### **New Parameters and Attributes Added** +- **`text_mode` (boolean)**: Enables text-only mode, disables images, JavaScript, and GPU-related features for faster, minimal rendering. +- **`light_mode` (boolean)**: Optimizes the browser by disabling unnecessary background processes and features for efficiency. +- **`viewport_width` and `viewport_height`**: Dynamically adjusts based on `text_mode` mode (default values: 800x600 for `text_mode`, 1920x1080 otherwise). +- **`extra_args`**: Adds browser-specific flags for `text_mode` mode. +- **`adjust_viewport_to_content`**: Dynamically adjusts the viewport to the content size for accurate rendering. + +#### **Browser Context Adjustments** +- Added **`viewport` adjustments**: Dynamically computed based on `text_mode` or custom configuration. +- Enhanced support for `light_mode` and `text_mode` by adding specific browser arguments to reduce resource consumption. + +#### **Dynamic Content Handling** +- **Full Page Scan Feature**: + - Scrolls through the entire page while dynamically detecting content changes. 
+ - Ensures scrolling stops when no new dynamic content is loaded. + +#### **Session Management** +- Added **`create_session`** method: + - Creates a new browser session and assigns a unique ID. + - Supports persistent and non-persistent contexts with full compatibility for cookies, headers, and proxies. + +#### **Improved Content Loading and Adjustment** +- **`adjust_viewport_to_content`**: + - Automatically adjusts viewport to match content dimensions. + - Includes scaling via Chrome DevTools Protocol (CDP). +- Enhanced content loading: + - Waits for images to load and ensures network activity is idle before proceeding. + +#### **Error Handling and Logging** +- Improved error handling and detailed logging for: + - Viewport adjustment (`adjust_viewport_to_content`). + - Full page scanning (`scan_full_page`). + - Dynamic content loading. + +#### **Refactoring and Cleanup** +- Removed hardcoded viewport dimensions in multiple places, replaced with dynamic values (`self.viewport_width`, `self.viewport_height`). +- Removed commented-out and unused code for better readability. +- Added default value for `delay_before_return_html` parameter. + +#### **Optimizations** +- Reduced resource usage in `light_mode` by disabling unnecessary browser features such as extensions, background timers, and sync. +- Improved compatibility for different browser types (`chrome`, `firefox`, `webkit`). + +--- + +### **File: `docs/examples/quickstart_async.py`** + +#### **Schema Adjustment** +- Changed schema reference for `LLMExtractionStrategy`: + - **Old**: `OpenAIModelFee.schema()` + - **New**: `OpenAIModelFee.model_json_schema()` + - This likely ensures better compatibility with the `OpenAIModelFee` class and its JSON schema. + +#### **Documentation Comments Updated** +- Improved extraction instruction for schema-based LLM strategies. + +--- + +### **New Features Added** +1. **Text-Only Mode**: + - Focuses on minimal resource usage by disabling non-essential browser features. +2. **Light Mode**: + - Optimizes browser for performance by disabling background tasks and unnecessary services. +3. **Full Page Scanning**: + - Ensures the entire content of a page is crawled, including dynamic elements loaded during scrolling. +4. **Dynamic Viewport Adjustment**: + - Automatically resizes the viewport to match content dimensions, improving compatibility and rendering accuracy. +5. **Session Management**: + - Simplifies session handling with better support for persistent and non-persistent contexts. + +--- + +### **Bug Fixes** +- Fixed potential viewport mismatches by ensuring consistent use of `self.viewport_width` and `self.viewport_height` throughout the code. +- Improved robustness of dynamic content loading to avoid timeouts and failed evaluations. + + + + + + + +## [0.3.75] December 1, 2024 + +### PruningContentFilter + +#### 1. Introduced PruningContentFilter (Dec 01, 2024) (Dec 01, 2024) +A new content filtering strategy that removes less relevant nodes based on metrics like text and link density. + +**Affected Files:** +- `crawl4ai/content_filter_strategy.py`: Enhancement of content filtering capabilities. +```diff +Implemented effective pruning algorithm with comprehensive scoring. +``` +- `README.md`: Improved documentation regarding new features. +```diff +Updated to include usage and explanation for the PruningContentFilter. +``` +- `docs/md_v2/basic/content_filtering.md`: Expanded documentation for users. +```diff +Added detailed section explaining the PruningContentFilter. +``` + +#### 2. 
Added Unit Tests for PruningContentFilter (Dec 01, 2024) (Dec 01, 2024) +Comprehensive tests added to ensure correct functionality of PruningContentFilter + +**Affected Files:** +- `tests/async/test_content_filter_prune.py`: Increased test coverage for content filtering strategies. +```diff +Created test cases for various scenarios using the PruningContentFilter. +``` + +### Development Updates + +#### 3. Enhanced BM25ContentFilter tests (Dec 01, 2024) (Dec 01, 2024) +Extended testing to cover additional edge cases and performance metrics. + +**Affected Files:** +- `tests/async/test_content_filter_bm25.py`: Improved reliability and performance assurance. +```diff +Added tests for new extraction scenarios including malformed HTML. +``` + +### Infrastructure & Documentation + +#### 4. Updated Examples (Dec 01, 2024) (Dec 01, 2024) +Altered examples in documentation to promote the use of PruningContentFilter alongside existing strategies. + +**Affected Files:** +- `docs/examples/quickstart_async.py`: Enhanced usability and clarity for new users. +- Revised example to illustrate usage of PruningContentFilter. + +## [0.3.746] November 29, 2024 + +### Major Features +1. Enhanced Docker Support (Nov 29, 2024) + - Improved GPU support in Docker images. + - Dockerfile refactored for better platform-specific installations. + - Introduced new Docker commands for different platforms: + - `basic-amd64`, `all-amd64`, `gpu-amd64` for AMD64. + - `basic-arm64`, `all-arm64`, `gpu-arm64` for ARM64. + +### Infrastructure & Documentation +- Enhanced README.md to improve user guidance and installation instructions. +- Added installation instructions for Playwright setup in README. +- Created and updated examples in `docs/examples/quickstart_async.py` to be more useful and user-friendly. +- Updated `requirements.txt` with a new `pydantic` dependency. +- Bumped version number in `crawl4ai/__version__.py` to 0.3.746. + +### Breaking Changes +- Streamlined application structure: + - Removed static pages and related code from `main.py` which might affect existing deployments relying on static content. + +### Development Updates +- Developed `post_install` method in `crawl4ai/install.py` to streamline post-installation setup tasks. +- Refined migration processes in `crawl4ai/migrations.py` with enhanced logging for better error visibility. +- Updated `docker-compose.yml` to support local and hub services for different architectures, enhancing build and deploy capabilities. +- Refactored example test cases in `docs/examples/docker_example.py` to facilitate comprehensive testing. + +### README.md +Updated README with new docker commands and setup instructions. +Enhanced installation instructions and guidance. + +### crawl4ai/install.py +Added post-install script functionality. +Introduced `post_install` method for automation of post-installation tasks. + +### crawl4ai/migrations.py +Improved migration logging. +Refined migration processes and added better logging. + +### docker-compose.yml +Refactored docker-compose for better service management. +Updated to define services for different platforms and versions. + +### requirements.txt +Updated dependencies. +Added `pydantic` to requirements file. + +### crawler/__version__.py +Updated version number. +Bumped version number to 0.3.746. + +### docs/examples/quickstart_async.py +Enhanced example scripts. +Uncommented example usage in async guide for user functionality. + +### main.py +Refactored code to improve maintainability. 
+Streamlined app structure by removing static pages code. + +## [0.3.743] November 27, 2024 + +Enhance features and documentation +- Updated version to 0.3.743 +- Improved ManagedBrowser configuration with dynamic host/port +- Implemented fast HTML formatting in web crawler +- Enhanced markdown generation with a new generator class +- Improved sanitization and utility functions +- Added contributor details and pull request acknowledgments +- Updated documentation for clearer usage scenarios +- Adjusted tests to reflect class name changes + +### CONTRIBUTORS.md +Added new contributors and pull request details. +Updated community contributions and acknowledged pull requests. + +### crawl4ai/__version__.py +Version update. +Bumped version to 0.3.743. + +### crawl4ai/async_crawler_strategy.py +Improved ManagedBrowser configuration. +Enhanced browser initialization with configurable host and debugging port; improved hook execution. + +### crawl4ai/async_webcrawler.py +Optimized HTML processing. +Implemented 'fast_format_html' for optimized HTML formatting; applied it when 'prettiify' is enabled. + +### crawl4ai/content_scraping_strategy.py +Enhanced markdown generation strategy. +Updated to use DefaultMarkdownGenerator and improved markdown generation with filters option. + +### crawl4ai/markdown_generation_strategy.py +Refactored markdown generation class. +Renamed DefaultMarkdownGenerationStrategy to DefaultMarkdownGenerator; added content filter handling. + +### crawl4ai/utils.py +Enhanced utility functions. +Improved input sanitization and enhanced HTML formatting method. + +### docs/md_v2/advanced/hooks-auth.md +Improved documentation for hooks. +Updated code examples to include cookies in crawler strategy initialization. + +### tests/async/test_markdown_genertor.py +Refactored tests to match class renaming. +Updated tests to use renamed DefaultMarkdownGenerator class. + +## [0.3.74] November 17, 2024 + +This changelog details the updates and changes introduced in Crawl4AI version 0.3.74. It's designed to inform developers about new features, modifications to existing components, removals, and other important information. + +### 1. File Download Processing + +- Users can now specify download folders using the `downloads_path` parameter in the `AsyncWebCrawler` constructor or the `arun` method. If not specified, downloads are saved to a "downloads" folder within the `.crawl4ai` directory. +- File download tracking is integrated into the `CrawlResult` object. Successfully downloaded files are listed in the `downloaded_files` attribute, providing their paths. +- Added `accept_downloads` parameter to the crawler strategies (defaults to `False`). If set to True you can add JS code and `wait_for` parameter for file download. 
+ +**Example:** + +```python +import asyncio +import os +from pathlib import Path +from crawl4ai import AsyncWebCrawler + +async def download_example(): + downloads_path = os.path.join(Path.home(), ".crawl4ai", "downloads") + os.makedirs(downloads_path, exist_ok=True) + + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path=downloads_path, + verbose=True + ) as crawler: + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code=""" + const downloadLink = document.querySelector('a[href$=".exe"]'); + if (downloadLink) { downloadLink.click(); } + """, + wait_for=5 # To ensure download has started + ) + + if result.downloaded_files: + print("Downloaded files:") + for file in result.downloaded_files: + print(f"- {file}") + +asyncio.run(download_example()) + +``` + +### 2. Refined Content Filtering + +- Introduced the `RelevanceContentFilter` strategy (and its implementation `BM25ContentFilter`) for extracting relevant content from web pages, replacing Fit Markdown and other content cleaning strategy. This new strategy leverages the BM25 algorithm to identify chunks of text relevant to the page's title, description, keywords, or a user-provided query. +- The `fit_markdown` flag in the content scraper is used to filter content based on title, meta description, and keywords. + +**Example:** + +```python +from crawl4ai import AsyncWebCrawler +from crawl4ai.content_filter_strategy import BM25ContentFilter + +async def filter_content(url, query): + async with AsyncWebCrawler() as crawler: + content_filter = BM25ContentFilter(user_query=query) + result = await crawler.arun(url=url, extraction_strategy=content_filter, fit_markdown=True) + print(result.extracted_content) # Or result.fit_markdown for the markdown version + print(result.fit_html) # Or result.fit_html to show HTML with only the filtered content + +asyncio.run(filter_content("https://en.wikipedia.org/wiki/Apple", "fruit nutrition health")) +``` + +### 3. Raw HTML and Local File Support + +- Added support for crawling local files and raw HTML content directly. +- Use the `file://` prefix for local file paths. +- Use the `raw:` prefix for raw HTML strings. + +**Example:** + +```python +async def crawl_local_or_raw(crawler, content, content_type): + prefix = "file://" if content_type == "local" else "raw:" + url = f"{prefix}{content}" + result = await crawler.arun(url=url) + if result.success: + print(f"Markdown Content from {content_type.title()} Source:") + print(result.markdown) + +# Example usage with local file and raw HTML +async def main(): + async with AsyncWebCrawler() as crawler: + # Local File + await crawl_local_or_raw( + crawler, os.path.abspath('tests/async/sample_wikipedia.html'), "local" + ) + # Raw HTML + await crawl_raw_html(crawler, "

<html><body><h1>Raw Test</h1><p>This is raw HTML.</p></body></html>
") + + +asyncio.run(main()) +``` + +### 4. Browser Management + +- New asynchronous crawler strategy implemented using Playwright. +- `ManagedBrowser` class introduced for improved browser session handling, offering features like persistent browser sessions between requests (using `session_id` parameter) and browser process monitoring. +- Updated to tf-playwright-stealth for enhanced stealth capabilities. +- Added `use_managed_browser`, `use_persistent_context`, and `chrome_channel` parameters to AsyncPlaywrightCrawlerStrategy. + + +**Example:** +```python +async def browser_management_demo(): + user_data_dir = os.path.join(Path.home(), ".crawl4ai", "user-data-dir") + os.makedirs(user_data_dir, exist_ok=True) # Ensure directory exists + async with AsyncWebCrawler( + use_managed_browser=True, + user_data_dir=user_data_dir, + use_persistent_context=True, + verbose=True + ) as crawler: + result1 = await crawler.arun( + url="https://example.com", session_id="my_session" + ) + result2 = await crawler.arun( + url="https://example.com/anotherpage", session_id="my_session" + ) + +asyncio.run(browser_management_demo()) +``` + + +### 5. API Server & Cache Improvements + +- Added CORS support to API server. +- Implemented static file serving. +- Enhanced root redirect functionality. +- Cache database updated to store response headers and downloaded files information. It utilizes a file system approach to manage large content efficiently. +- New, more efficient caching database built using xxhash and file system approach. +- Introduced `CacheMode` enum (`ENABLED`, `DISABLED`, `READ_ONLY`, `WRITE_ONLY`, `BYPASS`) and `always_bypass_cache` parameter in AsyncWebCrawler for fine-grained cache control. This replaces `bypass_cache`, `no_cache_read`, `no_cache_write`, and `always_by_pass_cache`. + + +### 🗑️ Removals + +- Removed deprecated: `crawl4ai/content_cleaning_strategy.py`. +- Removed internal class ContentCleaningStrategy +- Removed legacy cache control flags: `bypass_cache`, `disable_cache`, `no_cache_read`, `no_cache_write`, and `always_by_pass_cache`. These have been superseded by `cache_mode`. + + +### ⚙️ Other Changes + +- Moved version file to `crawl4ai/__version__.py`. +- Added `crawl4ai/cache_context.py`. +- Added `crawl4ai/version_manager.py`. +- Added `crawl4ai/migrations.py`. +- Added `crawl4ai-migrate` entry point. +- Added config `NEED_MIGRATION` and `SHOW_DEPRECATION_WARNINGS`. +- API server now requires an API token for authentication, configurable with the `CRAWL4AI_API_TOKEN` environment variable. This enhances API security. +- Added synchronous crawl endpoint `/crawl_sync` for immediate result retrieval, and direct crawl endpoint `/crawl_direct` bypassing the task queue. + + +### ⚠️ Deprecation Notices + +- The synchronous version of `WebCrawler` is being phased out. While still available via `crawl4ai[sync]`, it will eventually be removed. Transition to `AsyncWebCrawler` is strongly recommended. Boolean cache control flags in `arun` are also deprecated, migrate to using the `cache_mode` parameter. See examples in the "New Features" section above for correct usage. + + +### 🐛 Bug Fixes + +- Resolved issue with browser context closing unexpectedly in Docker. This significantly improves stability, particularly within containerized environments. +- Fixed memory leaks associated with incorrect asynchronous cleanup by removing the `__del__` method and ensuring the browser context is closed explicitly using context managers. +- Improved error handling in `WebScrapingStrategy`. 
More detailed error messages and suggestions for debugging will minimize frustration when running into unexpected issues. +- Fixed issue with incorrect text parsing in specific HTML structures. + + +### Example of migrating to the new CacheMode: + +**Old way:** + +```python +crawler = AsyncWebCrawler(always_by_pass_cache=True) +result = await crawler.arun(url="https://example.com", bypass_cache=True) +``` + +**New way:** + +```python +from crawl4ai import CacheMode + +crawler = AsyncWebCrawler(always_bypass_cache=True) +result = await crawler.arun(url="https://example.com", cache_mode=CacheMode.BYPASS) +``` + + +## [0.3.74] - November 13, 2024 + +1. **File Download Processing** (Nov 14, 2024) + - Added capability for users to specify download folders + - Implemented file download tracking in crowd result object + - Created new file: `tests/async/test_async_doanloader.py` + +2. **Content Filtering Improvements** (Nov 14, 2024) + - Introduced Relevance Content Filter as an improvement over Fit Markdown + - Implemented BM25 algorithm for content relevance matching + - Added new file: `crawl4ai/content_filter_strategy.py` + - Removed deprecated: `crawl4ai/content_cleaning_strategy.py` + +3. **Local File and Raw HTML Support** (Nov 13, 2024) + - Added support for processing local files + - Implemented raw HTML input handling in AsyncWebCrawler + - Enhanced `crawl4ai/async_webcrawler.py` with significant performance improvements + +4. **Browser Management Enhancements** (Nov 12, 2024) + - Implemented new async crawler strategy using Playwright + - Introduced ManagedBrowser for better browser session handling + - Added support for persistent browser sessions + - Updated from playwright_stealth to tf-playwright-stealth + +5. **API Server Component** + - Added CORS support + - Implemented static file serving + - Enhanced root redirect functionality + + + +## [0.3.731] - November 13, 2024 + +### Added +- Support for raw HTML and local file crawling via URL prefixes ('raw:', 'file://') +- Browser process monitoring for managed browser instances +- Screenshot capability for raw HTML and local file content +- Response headers storage in cache database +- New `fit_markdown` flag for optional markdown generation + +### Changed +- Switched HTML parser from 'html.parser' to 'lxml' for ~4x performance improvement +- Optimized BeautifulSoup text conversion and element selection +- Pre-compiled regular expressions for better performance +- Improved metadata extraction efficiency +- Response headers now stored alongside HTML in cache + +### Removed +- `__del__` method from AsyncPlaywrightCrawlerStrategy to prevent async cleanup issues + +### Fixed +- Issue #256: Added support for crawling raw HTML content +- Issue #253: Implemented file:// protocol handling +- Missing response headers in cached results +- Memory leaks from improper async cleanup + +## [v0.3.731] - 2024-11-13 Changelog for Issue 256 Fix +- Fixed: Browser context unexpectedly closing in Docker environment during crawl operations. +- Removed: __del__ method from AsyncPlaywrightCrawlerStrategy to prevent unreliable asynchronous cleanup, ensuring - browser context is closed explicitly within context managers. +- Added: Monitoring for ManagedBrowser subprocess to detect and log unexpected terminations. +- Updated: Dockerfile configurations to expose debugging port (9222) and allocate additional shared memory for improved browser stability. 
+- Improved: Error handling and resource cleanup processes for browser lifecycle management within the Docker environment. + +## [v0.3.73] - 2024-11-05 + +### Major Features +- **New Doctor Feature** + - Added comprehensive system diagnostics tool + - Available through package hub and CLI + - Provides automated troubleshooting and system health checks + - Includes detailed reporting of configuration issues + +- **Dockerized API Server** + - Released complete Docker implementation for API server + - Added comprehensive documentation for Docker deployment + - Implemented container communication protocols + - Added environment configuration guides + +- **Managed Browser Integration** + - Added support for user-controlled browser instances + - Implemented `ManagedBrowser` class for better browser lifecycle management + - Added ability to connect to existing Chrome DevTools Protocol (CDP) endpoints + - Introduced user data directory support for persistent browser profiles + +- **Enhanced HTML Processing** + - Added HTML tag preservation feature during markdown conversion + - Introduced configurable tag preservation system + - Improved pre-tag and code block handling + - Added support for nested preserved tags with attribute retention + +### Improvements +- **Browser Handling** + - Added flag to ignore body visibility for problematic pages + - Improved browser process cleanup and management + - Enhanced temporary directory handling for browser profiles + - Added configurable browser launch arguments + +- **Database Management** + - Implemented connection pooling for better performance + - Added retry logic for database operations + - Improved error handling and logging + - Enhanced cleanup procedures for database connections + +- **Resource Management** + - Added memory and CPU monitoring + - Implemented dynamic task slot allocation based on system resources + - Added configurable cleanup intervals + +### Technical Improvements +- **Code Structure** + - Moved version management to dedicated _version.py file + - Improved error handling throughout the codebase + - Enhanced logging system with better error reporting + - Reorganized core components for better maintainability + +### Bug Fixes +- Fixed issues with browser process termination +- Improved handling of connection timeouts +- Enhanced error recovery in database operations +- Fixed memory leaks in long-running processes + +### Dependencies +- Updated Playwright to v1.47 +- Updated core dependencies with more flexible version constraints +- Added new development dependencies for testing + +### Breaking Changes +- Changed default browser handling behavior +- Modified database connection management approach +- Updated API response structure for better consistency + +### Migration Guide +When upgrading to v0.3.73, be aware of the following changes: + +1. Docker Deployment: + - Review Docker documentation for new deployment options + - Update environment configurations as needed + - Check container communication settings + +2. If using custom browser management: + - Update browser initialization code to use new ManagedBrowser class + - Review browser cleanup procedures + +3. For database operations: + - Check custom database queries for compatibility with new connection pooling + - Update error handling to work with new retry logic + +4. 
Using the Doctor: + - Run doctor command for system diagnostics: `crawl4ai doctor` + - Review generated reports for potential issues + - Follow recommended fixes for any identified problems + + +## [v0.3.73] - 2024-11-04 +This commit introduces several key enhancements, including improved error handling and robust database operations in `async_database.py`, which now features a connection pool and retry logic for better reliability. Updates to the README.md provide clearer instructions and a better user experience with links to documentation sections. The `.gitignore` file has been refined to include additional directories, while the async web crawler now utilizes a managed browser for more efficient crawling. Furthermore, multiple dependency updates and introduction of the `CustomHTML2Text` class enhance text extraction capabilities. + +## [v0.3.73] - 2024-10-24 + +### Added +- preserve_tags: Added support for preserving specific HTML tags during markdown conversion. +- Smart overlay removal system in AsyncPlaywrightCrawlerStrategy: + - Automatic removal of popups, modals, and cookie notices + - Detection and removal of fixed/sticky position elements + - Cleaning of empty block elements + - Configurable via `remove_overlay_elements` parameter +- Enhanced screenshot capabilities: + - Added `screenshot_wait_for` parameter to control timing + - Improved screenshot handling with existing page context + - Better error handling with fallback error images +- New URL normalization utilities: + - `normalize_url` function for consistent URL formatting + - `is_external_url` function for better link classification +- Custom base directory support for cache storage: + - New `base_directory` parameter in AsyncWebCrawler + - Allows specifying alternative locations for `.crawl4ai` folder + +### Enhanced +- Link handling improvements: + - Better duplicate link detection + - Enhanced internal/external link classification + - Improved handling of special URL protocols + - Support for anchor links and protocol-relative URLs +- Configuration refinements: + - Streamlined social media domain list + - More focused external content filtering +- LLM extraction strategy: + - Added support for separate API base URL via `api_base` parameter + - Better handling of base URLs in configuration + +### Fixed +- Screenshot functionality: + - Resolved issues with screenshot timing and context + - Improved error handling and recovery +- Link processing: + - Fixed URL normalization edge cases + - Better handling of invalid URLs + - Improved error messages for link processing failures + +### Developer Notes +- The overlay removal system uses advanced JavaScript injection for better compatibility +- URL normalization handles special cases like mailto:, tel:, and protocol-relative URLs +- Screenshot system now reuses existing page context for better performance +- Link processing maintains separate dictionaries for internal and external links to ensure uniqueness + +## [v0.3.72] - 2024-10-22 + +### Added +- New `ContentCleaningStrategy` class: + - Smart content extraction based on text density and element scoring + - Automatic removal of boilerplate content + - DOM tree analysis for better content identification + - Configurable thresholds for content detection +- Advanced proxy support: + - Added `proxy_config` option for authenticated proxy connections + - Support for username/password in proxy configuration +- New content output formats: + - `fit_markdown`: Optimized markdown output with main content focus + - `fit_html`: 
Clean HTML with only essential content + +### Enhanced +- Image source detection: + - Support for multiple image source attributes (`src`, `data-src`, `srcset`, etc.) + - Automatic fallback through potential source attributes + - Smart handling of srcset attribute +- External content handling: + - Made external link exclusion optional (disabled by default) + - Improved detection and handling of social media links + - Better control over external image filtering + +### Fixed +- Image extraction reliability with multiple source attribute checks +- External link and image handling logic for better accuracy + +### Developer Notes +- The new `ContentCleaningStrategy` uses configurable thresholds for customization +- Proxy configuration now supports more complex authentication scenarios +- Content extraction process now provides both regular and optimized outputs + +## [v0.3.72] - 2024-10-20 + +### Fixed +- Added support for parsing Base64 encoded images in WebScrapingStrategy + +### Added +- Forked and integrated a customized version of the html2text library for more control over Markdown generation +- New configuration options for controlling external content: + - Ability to exclude all external links + - Option to specify domains to exclude (default includes major social media platforms) + - Control over excluding external images + +### Changed +- Improved Markdown generation process: + - Added fine-grained control over character escaping in Markdown output + - Enhanced handling of code blocks and pre-formatted text +- Updated `AsyncPlaywrightCrawlerStrategy.close()` method to use a shorter sleep time (0.5 seconds instead of 500) +- Enhanced flexibility in `CosineStrategy` with a more generic `load_HF_embedding_model` function + +### Improved +- Optimized content scraping and processing for better efficiency +- Enhanced error handling and logging in various components + +### Developer Notes +- The customized html2text library is now located within the crawl4ai package +- New configuration options are available in the `config.py` file for external content handling +- The `WebScrapingStrategy` class has been updated to accommodate new external content exclusion options + +## [v0.3.71] - 2024-10-19 + +### Added +- New chunking strategies: + - `OverlappingWindowChunking`: Allows for overlapping chunks of text, useful for maintaining context between chunks. + - Enhanced `SlidingWindowChunking`: Improved to handle edge cases and last chunks more effectively. + +### Changed +- Updated `CHUNK_TOKEN_THRESHOLD` in config to 2048 tokens (2^11) for better compatibility with most LLM models. +- Improved `AsyncPlaywrightCrawlerStrategy.close()` method to use a shorter sleep time (0.5 seconds instead of 500), significantly reducing wait time when closing the crawler. +- Enhanced flexibility in `CosineStrategy`: + - Now uses a more generic `load_HF_embedding_model` function, allowing for easier swapping of embedding models. +- Updated `JsonCssExtractionStrategy` and `JsonXPathExtractionStrategy` for better JSON-based extraction. + +### Fixed +- Addressed potential issues with the sliding window chunking strategy to ensure all text is properly chunked. + +### Developer Notes +- Added more comprehensive docstrings to chunking strategies for better code documentation. +- Removed hardcoded device setting in `CosineStrategy`, now using the automatically detected device. +- Added a new example in `quickstart_async.py` for generating a knowledge graph from crawled content. 
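**Example (illustrative sketch):** a minimal look at how the new chunking strategies might be used. The `window_size`, `step`, and `overlap` parameters shown here are assumptions; consult `crawl4ai.chunking_strategy` for the exact signatures.

```python
# Illustrative sketch of the chunking strategies added in this release.
# The window_size / step / overlap parameter names are assumptions; check
# crawl4ai.chunking_strategy for the exact constructor signatures.
from crawl4ai.chunking_strategy import SlidingWindowChunking, OverlappingWindowChunking

text = "Crawl4AI splits long documents into smaller pieces before extraction. " * 200

sliding = SlidingWindowChunking(window_size=100, step=50)             # assumed parameters
overlapping = OverlappingWindowChunking(window_size=500, overlap=50)  # assumed parameters

sliding_chunks = sliding.chunk(text)
overlapping_chunks = overlapping.chunk(text)

print(f"Sliding window produced {len(sliding_chunks)} chunks")
print(f"Overlapping window produced {len(overlapping_chunks)} chunks")
```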
+ +These updates aim to provide more flexibility in text processing, improve performance, and enhance the overall capabilities of the crawl4ai library. The new chunking strategies, in particular, offer more options for handling large texts in various scenarios. + +## [v0.3.71] - 2024-10-18 + +### Changes +1. **Version Update**: + - Updated version number from 0.3.7 to 0.3.71. + +2. **Crawler Enhancements**: + - Added `sleep_on_close` option to AsyncPlaywrightCrawlerStrategy for delayed browser closure. + - Improved context creation with additional options: + - Enabled `accept_downloads` and `java_script_enabled`. + - Added a cookie to enable cookies by default. + +3. **Error Handling Improvements**: + - Enhanced error messages in AsyncWebCrawler's `arun` method. + - Updated error reporting format for better visibility and consistency. + +4. **Performance Optimization**: + - Commented out automatic page and context closure in `crawl` method to potentially improve performance in certain scenarios. + +### Documentation +- Updated quickstart notebook: + - Changed installation command to use the released package instead of GitHub repository. + - Updated kernel display name. + +### Developer Notes +- Minor code refactoring and cleanup. + +## [v0.3.7] - 2024-10-17 + +### New Features +1. **Enhanced Browser Stealth**: + - Implemented `playwright_stealth` for improved bot detection avoidance. + - Added `StealthConfig` for fine-tuned control over stealth parameters. + +2. **User Simulation**: + - New `simulate_user` option to mimic human-like interactions (mouse movements, clicks, keyboard presses). + +3. **Navigator Override**: + - Added `override_navigator` option to modify navigator properties, further improving bot detection evasion. + +4. **Improved iframe Handling**: + - New `process_iframes` parameter to extract and integrate iframe content into the main page. + +5. **Flexible Browser Selection**: + - Support for choosing between Chromium, Firefox, and WebKit browsers. + +6. **Include Links in Markdown**: + - Added support for including links in Markdown content, by definin g a new flag `include_links_on_markdown` in `crawl` method. + +### Improvements +1. **Better Error Handling**: + - Enhanced error reporting in WebScrapingStrategy with detailed error messages and suggestions. + - Added console message and error logging for better debugging. + +2. **Image Processing Enhancements**: + - Improved image dimension updating and filtering logic. + +3. **Crawling Flexibility**: + - Added support for custom viewport sizes. + - Implemented delayed content retrieval with `delay_before_return_html` parameter. + +4. **Performance Optimization**: + - Adjusted default semaphore count for parallel crawling. + +### Bug Fixes +- Fixed an issue where the HTML content could be empty after processing. + +### Examples +- Added new example `crawl_with_user_simulation()` demonstrating the use of user simulation and navigator override features. + +### Developer Notes +- Refactored code for better maintainability and readability. +- Updated browser launch arguments for improved compatibility and performance. + +## [v0.3.6] - 2024-10-12 + +### 1. Improved Crawling Control +- **New Hook**: Added `before_retrieve_html` hook in `AsyncPlaywrightCrawlerStrategy`. +- **Delayed HTML Retrieval**: Introduced `delay_before_return_html` parameter to allow waiting before retrieving HTML content. + - Useful for pages with delayed content loading. 
+- **Flexible Timeout**: `smart_wait` function now uses `page_timeout` (default 60 seconds) instead of a fixed 30-second timeout. + - Provides better handling for slow-loading pages. +- **How to use**: Set `page_timeout=your_desired_timeout` (in milliseconds) when calling `crawler.arun()`. + +### 2. Browser Type Selection +- Added support for different browser types (Chromium, Firefox, WebKit). +- Users can now specify the browser type when initializing AsyncWebCrawler. +- **How to use**: Set `browser_type="firefox"` or `browser_type="webkit"` when initializing AsyncWebCrawler. + +### 3. Screenshot Capture +- Added ability to capture screenshots during crawling. +- Useful for debugging and content verification. +- **How to use**: Set `screenshot=True` when calling `crawler.arun()`. + +### 4. Enhanced LLM Extraction Strategy +- Added support for multiple LLM providers (OpenAI, Hugging Face, Ollama). +- **Custom Arguments**: Added support for passing extra arguments to LLM providers via `extra_args` parameter. +- **Custom Headers**: Users can now pass custom headers to the extraction strategy. +- **How to use**: Specify the desired provider and custom arguments when using `LLMExtractionStrategy`. + +### 5. iframe Content Extraction +- New feature to process and extract content from iframes. +- **How to use**: Set `process_iframes=True` in the crawl method. + +### 6. Delayed Content Retrieval +- Introduced `get_delayed_content` method in `AsyncCrawlResponse`. +- Allows retrieval of content after a specified delay, useful for dynamically loaded content. +- **How to use**: Access `result.get_delayed_content(delay_in_seconds)` after crawling. + +### Improvements and Optimizations + +#### 1. AsyncWebCrawler Enhancements +- **Flexible Initialization**: Now accepts arbitrary keyword arguments, passed directly to the crawler strategy. +- Allows for more customized setups. + +#### 2. Image Processing Optimization +- Enhanced image handling in WebScrapingStrategy. +- Added filtering for small, invisible, or irrelevant images. +- Improved image scoring system for better content relevance. +- Implemented JavaScript-based image dimension updating for more accurate representation. + +#### 3. Database Schema Auto-updates +- Automatic database schema updates ensure compatibility with the latest version. + +#### 4. Enhanced Error Handling and Logging +- Improved error messages and logging for easier debugging. + +#### 5. Content Extraction Refinements +- Refined HTML sanitization process. +- Improved handling of base64 encoded images. +- Enhanced Markdown conversion process. +- Optimized content extraction algorithms. + +#### 6. Utility Function Enhancements +- `perform_completion_with_backoff` function now supports additional arguments for more customized API calls to LLM providers. + +### Bug Fixes +- Fixed an issue where image tags were being prematurely removed during content extraction. + +### Examples and Documentation +- Updated `quickstart_async.py` with examples of: + - Using custom headers in LLM extraction. + - Different LLM provider usage (OpenAI, Hugging Face, Ollama). + - Custom browser type usage. + +### Developer Notes +- Refactored code for better maintainability, flexibility, and performance. +- Enhanced type hinting throughout the codebase for improved development experience. +- Expanded error handling for more robust operation. 
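**Example (illustrative sketch):** a minimal crawl combining several of the options described above (`browser_type`, `screenshot`, `process_iframes`, `page_timeout`). Exact parameter placement may vary between versions, and the URL is a placeholder.

```python
# Illustrative sketch combining the v0.3.6 options described above.
# Parameter placement may differ between versions; the URL is a placeholder.
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    # browser_type is set when initializing the crawler (chromium, firefox, or webkit)
    async with AsyncWebCrawler(browser_type="firefox", verbose=True) as crawler:
        result = await crawler.arun(
            url="https://example.com",
            screenshot=True,       # capture a screenshot during the crawl
            process_iframes=True,  # extract and merge iframe content into the page
            page_timeout=60000,    # timeout used by smart_wait, in milliseconds
        )
        if result.success and result.screenshot:
            print("Screenshot captured (base64-encoded string)")
        print(result.markdown[:300])

asyncio.run(main())
```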
+ +These updates significantly enhance the flexibility, accuracy, and robustness of crawl4ai, providing users with more control and options for their web crawling and content extraction tasks. + +## [v0.3.5] - 2024-09-02 + +Enhance AsyncWebCrawler with smart waiting and screenshot capabilities + +- Implement smart_wait function in AsyncPlaywrightCrawlerStrategy +- Add screenshot support to AsyncCrawlResponse and AsyncWebCrawler +- Improve error handling and timeout management in crawling process +- Fix typo in CrawlResult model (responser_headers -> response_headers) + +## [v0.2.77] - 2024-08-04 + +Significant improvements in text processing and performance: + +- 🚀 **Dependency reduction**: Removed dependency on spaCy model for text chunk labeling in cosine extraction strategy. +- 🤖 **Transformer upgrade**: Implemented text sequence classification using a transformer model for labeling text chunks. +- ⚡ **Performance enhancement**: Improved model loading speed due to removal of spaCy dependency. +- 🔧 **Future-proofing**: Laid groundwork for potential complete removal of spaCy dependency in future versions. + +These changes address issue #68 and provide a foundation for faster, more efficient text processing in Crawl4AI. + +## [v0.2.76] - 2024-08-02 + +Major improvements in functionality, performance, and cross-platform compatibility! 🚀 + +- 🐳 **Docker enhancements**: Significantly improved Dockerfile for easy installation on Linux, Mac, and Windows. +- 🌐 **Official Docker Hub image**: Launched our first official image on Docker Hub for streamlined deployment. +- 🔧 **Selenium upgrade**: Removed dependency on ChromeDriver, now using Selenium's built-in capabilities for better compatibility. +- 🖼️ **Image description**: Implemented ability to generate textual descriptions for extracted images from web pages. +- ⚡ **Performance boost**: Various improvements to enhance overall speed and performance. + +A big shoutout to our amazing community contributors: +- [@aravindkarnam](https://github.com/aravindkarnam) for developing the textual description extraction feature. +- [@FractalMind](https://github.com/FractalMind) for creating the first official Docker Hub image and fixing Dockerfile errors. +- [@ketonkss4](https://github.com/ketonkss4) for identifying Selenium's new capabilities, helping us reduce dependencies. + +Your contributions are driving Crawl4AI forward! 🙌 + +## [v0.2.75] - 2024-07-19 + +Minor improvements for a more maintainable codebase: + +- 🔄 Fixed typos in `chunking_strategy.py` and `crawler_strategy.py` to improve code readability +- 🔄 Removed `.test_pads/` directory from `.gitignore` to keep our repository clean and organized + +These changes may seem small, but they contribute to a more stable and sustainable codebase. By fixing typos and updating our `.gitignore` settings, we're ensuring that our code is easier to maintain and scale in the long run. + +## [v0.2.74] - 2024-07-08 +A slew of exciting updates to improve the crawler's stability and robustness! 🎉 + +- 💻 **UTF encoding fix**: Resolved the Windows \"charmap\" error by adding UTF encoding. +- 🛡️ **Error handling**: Implemented MaxRetryError exception handling in LocalSeleniumCrawlerStrategy. +- 🧹 **Input sanitization**: Improved input sanitization and handled encoding issues in LLMExtractionStrategy. +- 🚮 **Database cleanup**: Removed existing database file and initialized a new one. 
+ + +## [v0.2.73] - 2024-07-03 + +💡 In this release, we've bumped the version to v0.2.73 and refreshed our documentation to ensure you have the best experience with our project. + +* Supporting website need "with-head" mode to crawl the website with head. +* Fixing the installation issues for setup.py and dockerfile. +* Resolve multiple issues. + +## [v0.2.72] - 2024-06-30 + +This release brings exciting updates and improvements to our project! 🎉 + +* 📚 **Documentation Updates**: Our documentation has been revamped to reflect the latest changes and additions. +* 🚀 **New Modes in setup.py**: We've added support for three new modes in setup.py: default, torch, and transformers. This enhances the project's flexibility and usability. +* 🐳 **Docker File Updates**: The Docker file has been updated to ensure seamless compatibility with the new modes and improvements. +* 🕷️ **Temporary Solution for Headless Crawling**: We've implemented a temporary solution to overcome issues with crawling websites in headless mode. + +These changes aim to improve the overall user experience, provide more flexibility, and enhance the project's performance. We're thrilled to share these updates with you and look forward to continuing to evolve and improve our project! + +## [0.2.71] - 2024-06-26 + +**Improved Error Handling and Performance** 🚧 + +* 🚫 Refactored `crawler_strategy.py` to handle exceptions and provide better error messages, making it more robust and reliable. +* 💻 Optimized the `get_content_of_website_optimized` function in `utils.py` for improved performance, reducing potential bottlenecks. +* 💻 Updated `utils.py` with the latest changes, ensuring consistency and accuracy. +* 🚫 Migrated to `ChromeDriverManager` to resolve Chrome driver download issues, providing a smoother user experience. + +These changes focus on refining the existing codebase, resulting in a more stable, efficient, and user-friendly experience. With these improvements, you can expect fewer errors and better performance in the crawler strategy and utility functions. + +## [0.2.71] - 2024-06-25 +### Fixed +- Speed up twice the extraction function. + + +## [0.2.6] - 2024-06-22 +### Fixed +- Fix issue #19: Update Dockerfile to ensure compatibility across multiple platforms. + +## [0.2.5] - 2024-06-18 +### Added +- Added five important hooks to the crawler: + - on_driver_created: Called when the driver is ready for initializations. + - before_get_url: Called right before Selenium fetches the URL. + - after_get_url: Called after Selenium fetches the URL. + - before_return_html: Called when the data is parsed and ready. + - on_user_agent_updated: Called when the user changes the user_agent, causing the driver to reinitialize. +- Added an example in `quickstart.py` in the example folder under the docs. +- Enhancement issue #24: Replaced inline HTML tags (e.g., DEL, INS, SUB, ABBR) with textual format for better context handling in LLM. +- Maintaining the semantic context of inline tags (e.g., abbreviation, DEL, INS) for improved LLM-friendliness. +- Updated Dockerfile to ensure compatibility across multiple platforms (Hopefully!). 
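**Example (illustrative sketch):** one way the hooks listed above might be attached to the synchronous crawler. The `set_hook` registration call and the callback signatures are assumptions inferred from the hook names; see the `quickstart.py` example mentioned above for the canonical usage.

```python
# Illustrative sketch of attaching the hooks listed above to the sync crawler.
# The set_hook(...) registration method and the callback signatures are
# assumptions; the actual API may differ.
from crawl4ai import WebCrawler
from crawl4ai.crawler_strategy import LocalSeleniumCrawlerStrategy

def on_driver_created(driver):
    print("Driver is ready for initializations")
    return driver

def before_get_url(driver):
    print("About to fetch the URL")
    return driver

strategy = LocalSeleniumCrawlerStrategy(verbose=True)
strategy.set_hook("on_driver_created", on_driver_created)  # assumed registration API
strategy.set_hook("before_get_url", before_get_url)

crawler = WebCrawler(crawler_strategy=strategy, verbose=True)
crawler.warmup()
result = crawler.run(url="https://example.com")
print(result.markdown[:200])
```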
+ +## [v0.2.4] - 2024-06-17 +### Fixed +- Fix issue #22: Use MD5 hash for caching HTML files to handle long URLs diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000000000000000000000000000000000..31dad7b9ae1f54c188b6379a35b6e189f0659271 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,131 @@ +# Crawl4AI Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, caste, color, religion, or sexual +identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the overall + community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or advances of + any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email address, + without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official email address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +unclecode@crawl4ai.com. All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. 
+ +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of +actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or permanent +ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the +community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.1, available at +[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. + +Community Impact Guidelines were inspired by +[Mozilla's code of conduct enforcement ladder][Mozilla CoC]. + +For answers to common questions about this code of conduct, see the FAQ at +[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at +[https://www.contributor-covenant.org/translations][translations]. + +[homepage]: https://www.contributor-covenant.org +[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html +[Mozilla CoC]: https://github.com/mozilla/diversity +[FAQ]: https://www.contributor-covenant.org/faq +[translations]: https://www.contributor-covenant.org/translations diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md new file mode 100644 index 0000000000000000000000000000000000000000..79038bdd5ced7f03747c9cb4246af1ec1793c966 --- /dev/null +++ b/CONTRIBUTORS.md @@ -0,0 +1,42 @@ +# Contributors to Crawl4AI + +We would like to thank the following people for their contributions to Crawl4AI: + +## Core Team + +- [Unclecode](https://github.com/unclecode) - Project Creator and Main Developer +- [Nasrin](https://github.com/ntohidi) - Project Manager and Developer +- [Aravind Karnam](https://github.com/aravindkarnam) - Developer + +## Community Contributors + +- [aadityakanjolia4](https://github.com/aadityakanjolia4) - Fix for `CustomHTML2Text` is not defined. 
+- [FractalMind](https://github.com/FractalMind) - Created the first official Docker Hub image and fixed Dockerfile errors
+- [ketonkss4](https://github.com/ketonkss4) - Identified Selenium's new capabilities, helping reduce dependencies
+- [jonymusky](https://github.com/jonymusky) - JavaScript execution and `wait_for` documentation
+- [datehoer](https://github.com/datehoer) - Added browser proxy support
+
+## Pull Requests
+
+- [dvschuyl](https://github.com/dvschuyl) - AsyncPlaywrightCrawlerStrategy page-evaluate context destroyed by navigation [#304](https://github.com/unclecode/crawl4ai/pull/304)
+- [nelzomal](https://github.com/nelzomal) - Enhance development installation instructions [#286](https://github.com/unclecode/crawl4ai/pull/286)
+- [HamzaFarhan](https://github.com/HamzaFarhan) - Handled the cases where markdown_with_citations, references_markdown, and filtered_html might not be defined [#293](https://github.com/unclecode/crawl4ai/pull/293)
+- [NanmiCoder](https://github.com/NanmiCoder) - fix: crawler strategy exception handling and fixes [#271](https://github.com/unclecode/crawl4ai/pull/271)
+- [paulokuong](https://github.com/paulokuong) - fix: RAWL4_AI_BASE_DIRECTORY should be Path object instead of string [#298](https://github.com/unclecode/crawl4ai/pull/298)
+
+
+## Other Contributors
+
+- [Gokhan](https://github.com/gkhngyk)
+- [Shiv Kumar](https://github.com/shivkumar0757)
+- [QIN2DIM](https://github.com/QIN2DIM)
+
+## Acknowledgements
+
+We also want to thank all the users who have reported bugs, suggested features, or helped in any other way to make Crawl4AI better.
+
+---
+
+If you've contributed to Crawl4AI and your name isn't on this list, please [open a pull request](https://github.com/unclecode/crawl4ai/pulls) with your name, link, and contribution, and we'll review it promptly.
+
+Thank you all for your contributions!
\ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..b51305e6fbed6719abf2484740beb5e934d4c047 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,136 @@ +# syntax=docker/dockerfile:1.4 + +ARG TARGETPLATFORM +ARG BUILDPLATFORM + +# Other build arguments +ARG PYTHON_VERSION=3.10 + +# Base stage with system dependencies +FROM python:${PYTHON_VERSION}-slim as base + +# Declare ARG variables again within the build stage +ARG INSTALL_TYPE=basic +ARG ENABLE_GPU=false + +# Platform-specific labels +LABEL maintainer="unclecode" +LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper" +LABEL version="1.0" + +# Environment setup +ENV PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + PIP_NO_CACHE_DIR=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 \ + PIP_DEFAULT_TIMEOUT=100 \ + DEBIAN_FRONTEND=noninteractive + +# Install system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + curl \ + wget \ + gnupg \ + git \ + cmake \ + pkg-config \ + python3-dev \ + libjpeg-dev \ + libpng-dev \ + && rm -rf /var/lib/apt/lists/* + +# Playwright system dependencies for Linux +RUN apt-get update && apt-get install -y --no-install-recommends \ + libglib2.0-0 \ + libnss3 \ + libnspr4 \ + libatk1.0-0 \ + libatk-bridge2.0-0 \ + libcups2 \ + libdrm2 \ + libdbus-1-3 \ + libxcb1 \ + libxkbcommon0 \ + libx11-6 \ + libxcomposite1 \ + libxdamage1 \ + libxext6 \ + libxfixes3 \ + libxrandr2 \ + libgbm1 \ + libpango-1.0-0 \ + libcairo2 \ + libasound2 \ + libatspi2.0-0 \ + && rm -rf /var/lib/apt/lists/* + +# GPU support if enabled and architecture is supported +RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETPLATFORM" = "linux/amd64" ] ; then \ + apt-get update && apt-get install -y --no-install-recommends \ + nvidia-cuda-toolkit \ + && rm -rf /var/lib/apt/lists/* ; \ +else \ + echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \ +fi + +# Create and set working directory +WORKDIR /app + +# Copy the entire project +COPY . . + +# Install base requirements +RUN pip install --no-cache-dir -r requirements.txt + +# Install required library for FastAPI +RUN pip install fastapi uvicorn psutil + +# Install ML dependencies first for better layer caching +RUN if [ "$INSTALL_TYPE" = "all" ] ; then \ + pip install --no-cache-dir \ + torch \ + torchvision \ + torchaudio \ + scikit-learn \ + nltk \ + transformers \ + tokenizers && \ + python -m nltk.downloader punkt stopwords ; \ + fi + +# Install the package +RUN if [ "$INSTALL_TYPE" = "all" ] ; then \ + pip install ".[all]" && \ + python -m crawl4ai.model_loader ; \ + elif [ "$INSTALL_TYPE" = "torch" ] ; then \ + pip install ".[torch]" ; \ + elif [ "$INSTALL_TYPE" = "transformer" ] ; then \ + pip install ".[transformer]" && \ + python -m crawl4ai.model_loader ; \ + else \ + pip install "." 
; \ + fi + + # Install MkDocs and required plugins +RUN pip install --no-cache-dir \ + mkdocs \ + mkdocs-material \ + mkdocs-terminal \ + pymdown-extensions + +# Build MkDocs documentation +RUN mkdocs build + +# Install Playwright and browsers +RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \ + playwright install chromium; \ + elif [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + playwright install chromium; \ + fi + +# Expose port +EXPOSE 8000 11235 9222 8080 + +# Start the FastAPI server +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "11235"] \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..59711bc927cfed45472bc311dd23e5596842955d --- /dev/null +++ b/LICENSE @@ -0,0 +1,51 @@ +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. + +"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: + +You must give any other recipients of the Work or Derivative Works a copy of this License; and +You must cause any modified files to carry prominent notices stating that You changed the files; and +You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and +If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. +You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. 
+ +END OF TERMS AND CONDITIONS \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..73a0e00b6b3ecaec89566bca8edaf037975b0095 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,2 @@ +include requirements.txt +recursive-include crawl4ai/js_snippet *.js \ No newline at end of file diff --git a/MISSION.md b/MISSION.md new file mode 100644 index 0000000000000000000000000000000000000000..726f2505759d6b7e87f93277a4a1dd74400fbc18 --- /dev/null +++ b/MISSION.md @@ -0,0 +1,46 @@ +# Mission + +![Mission Diagram](./docs/assets/pitch-dark.svg) + +### 1. The Data Capitalization Opportunity + +We live in an unprecedented era of digital wealth creation. Every day, individuals and enterprises generate massive amounts of valuable digital footprints across various platforms, social media channels, messenger apps, and cloud services. While people can interact with their data within these platforms, there's an immense untapped opportunity to transform this data into true capital assets. Just as physical property became a foundational element of wealth creation, personal and enterprise data has the potential to become a new form of capital on balance sheets. + +For individuals, this represents an opportunity to transform their digital activities into valuable assets. For enterprises, their internal communications, team discussions, and collaborative documents contain rich insights that could be structured and valued as intellectual capital. This wealth of information represents an unprecedented opportunity for value creation in the digital age. + +### 2. The Potential of Authentic Data + +While synthetic data has played a crucial role in AI development, there's an enormous untapped potential in the authentic data generated by individuals and organizations. Every message, document, and interaction contains unique insights and patterns that could enhance AI development. The challenge isn't a lack of data - it's that most authentic human-generated data remains inaccessible for productive use. + +By enabling willing participation in data sharing, we can unlock this vast reservoir of authentic human knowledge. This represents an opportunity to enhance AI development with diverse, real-world data that reflects the full spectrum of human experience and knowledge. + +## Our Pathway to Data Democracy + +### 1. Open-Source Foundation + +Our first step is creating an open-source data extraction engine that empowers developers and innovators to build tools for data structuring and organization. This foundation ensures transparency, security, and community-driven development. By making these tools openly available, we enable the technical infrastructure needed for true data ownership and capitalization. + +### 2. Data Capitalization Platform + +Building on this open-source foundation, we're developing a platform that helps individuals and enterprises transform their digital footprints into structured, valuable assets. This platform will provide the tools and frameworks needed to organize, understand, and value personal and organizational data as true capital assets. + +### 3. Creating a Data Marketplace + +The final piece is establishing a marketplace where individuals and organizations can willingly share their data assets. 
This creates opportunities for: +- Individuals to earn equity, revenue, or other forms of value from their data +- Enterprises to access diverse, high-quality data for AI development +- Researchers to work with authentic human-generated data +- Startups to build innovative solutions using real-world data + +## Economic Vision: A Shared Data Economy + +We envision a future where data becomes a fundamental asset class in a thriving shared economy. This transformation will democratize AI development by enabling willing participation in data sharing, ensuring that the benefits of AI advancement flow back to data creators. Just as property rights revolutionized economic systems, establishing data as a capital asset will create new opportunities for wealth creation and economic participation. + +This shared data economy will: +- Enable individuals to capitalize on their digital footprints +- Create new revenue streams for data creators +- Provide AI developers with access to diverse, authentic data +- Foster innovation through broader access to real-world data +- Ensure more equitable distribution of AI's economic benefits + +Our vision is to facilitate this transformation from the ground up - starting with open-source tools, progressing to data capitalization platforms, and ultimately creating a thriving marketplace where data becomes a true asset class in a shared economy. This approach ensures that the future of AI is built on a foundation of authentic human knowledge, with benefits flowing back to the individuals and organizations who create and share their valuable data. \ No newline at end of file diff --git a/README.md b/README.md index 22f403d1953147ba9b07d71b4307846e020d80c3..c385aa185e6be82b3e0d27b0d4f374e9b07182bc 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,565 @@ colorTo: pink sdk: docker pinned: false license: mit +port: 11235 --- -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference + + +# 🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & Scraper. + +
+ +unclecode%2Fcrawl4ai | Trendshift + +[![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers) +[![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members) + +[![PyPI version](https://badge.fury.io/py/crawl4ai.svg)](https://badge.fury.io/py/crawl4ai) +[![Python Version](https://img.shields.io/pypi/pyversions/crawl4ai)](https://pypi.org/project/crawl4ai/) +[![Downloads](https://static.pepy.tech/badge/crawl4ai/month)](https://pepy.tech/project/crawl4ai) + + +[![License](https://img.shields.io/github/license/unclecode/crawl4ai)](https://github.com/unclecode/crawl4ai/blob/main/LICENSE) +[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) +[![Security: bandit](https://img.shields.io/badge/security-bandit-yellow.svg)](https://github.com/PyCQA/bandit) +[![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.1-4baaaa.svg)](code_of_conduct.md) + +
+ +Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease. + +[✨ Check out latest update v0.4.24x](#-recent-updates) + +🎉 **Version 0.4.24x is out!** Major improvements in extraction strategies with enhanced JSON handling, SSL security, and Amazon product extraction. Plus, a completely revamped content filtering system! [Read the release notes →](https://crawl4ai.com/mkdocs/blog) + +## 🧐 Why Crawl4AI? + +1. **Built for LLMs**: Creates smart, concise Markdown optimized for RAG and fine-tuning applications. +2. **Lightning Fast**: Delivers results 6x faster with real-time, cost-efficient performance. +3. **Flexible Browser Control**: Offers session management, proxies, and custom hooks for seamless data access. +4. **Heuristic Intelligence**: Uses advanced algorithms for efficient extraction, reducing reliance on costly models. +5. **Open Source & Deployable**: Fully open-source with no API keys—ready for Docker and cloud integration. +6. **Thriving Community**: Actively maintained by a vibrant community and the #1 trending GitHub repository. + +## 🚀 Quick Start + +1. Install Crawl4AI: +```bash +# Install the package +pip install -U crawl4ai + +# Run post-installation setup +crawl4ai-setup + +# Verify your installation +crawl4ai-doctor +``` + +If you encounter any browser-related issues, you can install them manually: +```bash +python -m playwright install --with-deps chromium +``` + +2. Run a simple web crawl: +```python +import asyncio +from crawl4ai import * + +async def main(): + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", + ) + print(result.markdown) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +## ✨ Features + +
+📝 Markdown Generation + +- 🧹 **Clean Markdown**: Generates clean, structured Markdown with accurate formatting. +- 🎯 **Fit Markdown**: Heuristic-based filtering to remove noise and irrelevant parts for AI-friendly processing. +- 🔗 **Citations and References**: Converts page links into a numbered reference list with clean citations. +- 🛠️ **Custom Strategies**: Users can create their own Markdown generation strategies tailored to specific needs. +- 📚 **BM25 Algorithm**: Employs BM25-based filtering for extracting core information and removing irrelevant content. +
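+As a taste of the citation output mentioned above, the sketch below prints the citation-style markdown and the generated reference list. The `markdown_v2.markdown_with_citations` and `markdown_v2.references_markdown` attribute names follow the fields referenced in PR #293 and the current docs; treat them as assumptions if you are on a different version.
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+
+async def main():
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(url="https://example.com")
+        # Links in the body become numbered citations; references are listed separately.
+        print(result.markdown_v2.markdown_with_citations)  # assumed attribute, see note above
+        print(result.markdown_v2.references_markdown)      # assumed attribute, see note above
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+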
+ +
+📊 Structured Data Extraction + +- 🤖 **LLM-Driven Extraction**: Supports all LLMs (open-source and proprietary) for structured data extraction. +- 🧱 **Chunking Strategies**: Implements chunking (topic-based, regex, sentence-level) for targeted content processing. +- 🌌 **Cosine Similarity**: Find relevant content chunks based on user queries for semantic extraction. +- 🔎 **CSS-Based Extraction**: Fast schema-based data extraction using XPath and CSS selectors. +- 🔧 **Schema Definition**: Define custom schemas for extracting structured JSON from repetitive patterns. + +
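+The CSS- and LLM-based strategies have full examples later in this README; for the cosine-similarity option above, here is a minimal, hedged sketch using `CosineStrategy` (exported from `crawl4ai.extraction_strategy`). The parameter names (`semantic_filter`, `word_count_threshold`, `top_k`) reflect the docs at the time of writing and may differ in your version.
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+from crawl4ai.extraction_strategy import CosineStrategy
+
+async def main():
+    # Cluster page content and keep only chunks semantically close to the filter query.
+    run_config = CrawlerRunConfig(
+        extraction_strategy=CosineStrategy(
+            semantic_filter="GPU pricing and availability",  # assumed parameter names; check the docs
+            word_count_threshold=10,
+            top_k=3,
+        ),
+        cache_mode=CacheMode.BYPASS,
+    )
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(url="https://example.com/pricing", config=run_config)
+        print(result.extracted_content)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+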
+ +
+🌐 Browser Integration + +- 🖥️ **Managed Browser**: Use user-owned browsers with full control, avoiding bot detection. +- 🔄 **Remote Browser Control**: Connect to Chrome Developer Tools Protocol for remote, large-scale data extraction. +- 🔒 **Session Management**: Preserve browser states and reuse them for multi-step crawling. +- 🧩 **Proxy Support**: Seamlessly connect to proxies with authentication for secure access. +- ⚙️ **Full Browser Control**: Modify headers, cookies, user agents, and more for tailored crawling setups. +- 🌍 **Multi-Browser Support**: Compatible with Chromium, Firefox, and WebKit. +- 📐 **Dynamic Viewport Adjustment**: Automatically adjusts the browser viewport to match page content, ensuring complete rendering and capturing of all elements. + +
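+A minimal sketch of the proxy, header, and user-agent controls listed above, using `BrowserConfig` fields documented in `crawl4ai/async_configs.py` later in this repository. The proxy URL and header values are placeholders.
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+
+async def main():
+    browser_config = BrowserConfig(
+        browser_type="chromium",
+        headless=True,
+        proxy="http://username:password@proxy.example.com:8080",  # placeholder proxy URL
+        user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
+        headers={"Accept-Language": "en-US,en;q=0.9"},
+    )
+    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(url="https://example.com", config=run_config)
+        print(result.markdown[:500])
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+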
+ +
+<details>
+<summary>🔎 Crawling & Scraping</summary>
+
+- 🖼️ **Media Support**: Extract images, audio, videos, and responsive image formats like `srcset` and `picture`.
+- 🚀 **Dynamic Crawling**: Execute JavaScript and wait for synchronous or asynchronous conditions before extracting dynamic content.
+- 📸 **Screenshots**: Capture page screenshots during crawling for debugging or analysis.
+- 📂 **Raw Data Crawling**: Directly process raw HTML (`raw:`) or local files (`file://`), as shown in the sketch after this list.
+- 🔗 **Comprehensive Link Extraction**: Extracts internal and external links, as well as embedded iframe content.
+- 🛠️ **Customizable Hooks**: Define hooks at every step to customize crawling behavior.
+- 💾 **Caching**: Cache data for improved speed and to avoid redundant fetches.
+- 📄 **Metadata Extraction**: Retrieve structured metadata from web pages.
+- 📡 **IFrame Content Extraction**: Seamless extraction from embedded iframe content.
+- 🕵️ **Lazy Load Handling**: Waits for images to fully load, ensuring no content is missed due to lazy loading.
+- 🔄 **Full-Page Scanning**: Simulates scrolling to load and capture all dynamic content, perfect for infinite scroll pages.
+
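+A minimal sketch of the `raw:` and `file://` inputs (and the screenshot option) from the list above. The `screenshot` flag and `result.screenshot` field are assumptions based on the current docs, and the local file path is a placeholder.
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+
+async def main():
+    run_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        screenshot=True,  # assumed flag: ask the crawler to capture a page screenshot
+    )
+    async with AsyncWebCrawler() as crawler:
+        # Crawl an in-memory HTML snippet instead of fetching a URL.
+        raw_result = await crawler.arun(
+            url="raw:<html><body><h1>Hello</h1><p>Inline HTML example.</p></body></html>",
+            config=run_config,
+        )
+        print(raw_result.markdown)
+
+        # Crawl a local file (path is a placeholder).
+        file_result = await crawler.arun(url="file:///tmp/page.html", config=run_config)
+        print(len(file_result.markdown), "characters of markdown")
+        if file_result.screenshot:  # assumed: base64-encoded image when screenshot=True
+            print("Screenshot captured")
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+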
+ +
+🚀 Deployment + +- 🐳 **Dockerized Setup**: Optimized Docker image with API server for easy deployment. +- 🔄 **API Gateway**: One-click deployment with secure token authentication for API-based workflows. +- 🌐 **Scalable Architecture**: Designed for mass-scale production and optimized server performance. +- ⚙️ **DigitalOcean Deployment**: Ready-to-deploy configurations for DigitalOcean and similar platforms. + +
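+To make the API-gateway workflow above concrete, here is a hedged sketch of a client that submits a crawl job to the Dockerized server and polls the task endpoint until it completes. The `/crawl` and `/task/{id}` routes, the request payload, and the `completed` status mirror the Quick Test later in this README; the token header and the `result` field are assumptions and only apply if your deployment enables them.
+
+```python
+import time
+import requests
+
+BASE_URL = "http://localhost:11235"
+HEADERS = {"Authorization": "Bearer YOUR_API_TOKEN"}  # placeholder; only needed if token auth is enabled
+
+# Submit a crawl job to the API server.
+response = requests.post(
+    f"{BASE_URL}/crawl",
+    json={"urls": "https://example.com", "priority": 10},
+    headers=HEADERS,
+)
+task_id = response.json()["task_id"]
+
+# Poll until the task reports completion.
+while True:
+    status = requests.get(f"{BASE_URL}/task/{task_id}", headers=HEADERS).json()
+    if status.get("status") == "completed":
+        break
+    time.sleep(1)
+
+print(status.get("result", status))  # field name assumed; inspect the payload in your deployment
+```
+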
+ +
+🎯 Additional Features + +- 🕶️ **Stealth Mode**: Avoid bot detection by mimicking real users. +- 🏷️ **Tag-Based Content Extraction**: Refine crawling based on custom tags, headers, or metadata. +- 🔗 **Link Analysis**: Extract and analyze all links for detailed data exploration. +- 🛡️ **Error Handling**: Robust error management for seamless execution. +- 🔐 **CORS & Static Serving**: Supports filesystem-based caching and cross-origin requests. +- 📖 **Clear Documentation**: Simplified and updated guides for onboarding and advanced usage. +- 🙌 **Community Recognition**: Acknowledges contributors and pull requests for transparency. + +
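+For the stealth and tag-based filtering items above, here is a short, hedged sketch. `magic=True` and `word_count_threshold` appear elsewhere in this README; `excluded_tags` and `exclude_external_links` are assumed `CrawlerRunConfig` parameters based on the current docs and may differ in your version.
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+
+async def main():
+    run_config = CrawlerRunConfig(
+        word_count_threshold=10,                    # drop tiny text blocks
+        excluded_tags=["nav", "footer", "aside"],   # assumed parameter; tag-based filtering
+        exclude_external_links=True,                # assumed parameter; keep only same-site links
+        cache_mode=CacheMode.ENABLED,
+    )
+    async with AsyncWebCrawler() as crawler:
+        # magic=True enables the stealth-style tweaks used elsewhere in this README.
+        result = await crawler.arun(url="https://example.com", config=run_config, magic=True)
+        print(result.markdown[:300])
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+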
+ +## Try it Now! + +✨ Play around with this [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1SgRPrByQLzjRfwoRNq1wSGE9nYY_EE8C?usp=sharing) + +✨ Visit our [Documentation Website](https://crawl4ai.com/mkdocs/) + +## Installation 🛠️ + +Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package or use Docker. + +
+🐍 Using pip + +Choose the installation option that best fits your needs: + +### Basic Installation + +For basic web crawling and scraping tasks: + +```bash +pip install crawl4ai +crawl4ai-setup # Setup the browser +``` + +By default, this will install the asynchronous version of Crawl4AI, using Playwright for web crawling. + +👉 **Note**: When you install Crawl4AI, the `crawl4ai-setup` should automatically install and set up Playwright. However, if you encounter any Playwright-related errors, you can manually install it using one of these methods: + +1. Through the command line: + + ```bash + playwright install + ``` + +2. If the above doesn't work, try this more specific command: + + ```bash + python -m playwright install chromium + ``` + +This second method has proven to be more reliable in some cases. + +--- + +### Installation with Synchronous Version + +The sync version is deprecated and will be removed in future versions. If you need the synchronous version using Selenium: + +```bash +pip install crawl4ai[sync] +``` + +--- + +### Development Installation + +For contributors who plan to modify the source code: + +```bash +git clone https://github.com/unclecode/crawl4ai.git +cd crawl4ai +pip install -e . # Basic installation in editable mode +``` + +Install optional features: + +```bash +pip install -e ".[torch]" # With PyTorch features +pip install -e ".[transformer]" # With Transformer features +pip install -e ".[cosine]" # With cosine similarity features +pip install -e ".[sync]" # With synchronous crawling (Selenium) +pip install -e ".[all]" # Install all optional features +``` + +
+ +
+🐳 Docker Deployment + +> 🚀 **Major Changes Coming!** We're developing a completely new Docker implementation that will make deployment even more efficient and seamless. The current Docker setup is being deprecated in favor of this new solution. + +### Current Docker Support + +The existing Docker implementation is being deprecated and will be replaced soon. If you still need to use Docker with the current version: + +- 📚 [Deprecated Docker Setup](./docs/deprecated/docker-deployment.md) - Instructions for the current Docker implementation +- ⚠️ Note: This setup will be replaced in the next major release + +### What's Coming Next? + +Our new Docker implementation will bring: +- Improved performance and resource efficiency +- Streamlined deployment process +- Better integration with Crawl4AI features +- Enhanced scalability options + +Stay connected with our [GitHub repository](https://github.com/unclecode/crawl4ai) for updates! + +
+ +--- + +### Quick Test + +Run a quick test (works for both Docker options): + +```python +import requests + +# Submit a crawl job +response = requests.post( + "http://localhost:11235/crawl", + json={"urls": "https://example.com", "priority": 10} +) +task_id = response.json()["task_id"] + +# Continue polling until the task is complete (status="completed") +result = requests.get(f"http://localhost:11235/task/{task_id}") +``` + +For more examples, see our [Docker Examples](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_example.py). For advanced configuration, environment variables, and usage examples, see our [Docker Deployment Guide](https://crawl4ai.com/mkdocs/basic/docker-deployment/). + + + + +## 🔬 Advanced Usage Examples 🔬 + +You can check the project structure in the directory [https://github.com/unclecode/crawl4ai/docs/examples](docs/examples). Over there, you can find a variety of examples; here, some popular examples are shared. + +
+📝 Heuristic Markdown Generation with Clean and Fit Markdown + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode +from crawl4ai.content_filter_strategy import PruningContentFilter, BM25ContentFilter +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator + +async def main(): + browser_config = BrowserConfig( + headless=True, + verbose=True, + ) + run_config = CrawlerRunConfig( + cache_mode=CacheMode.ENABLED, + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0) + ), + # markdown_generator=DefaultMarkdownGenerator( + # content_filter=BM25ContentFilter(user_query="WHEN_WE_FOCUS_BASED_ON_A_USER_QUERY", bm25_threshold=1.0) + # ), + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://docs.micronaut.io/4.7.6/guide/", + config=run_config + ) + print(len(result.markdown)) + print(len(result.fit_markdown)) + print(len(result.markdown_v2.fit_markdown)) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +
+ +
+<details>
+<summary>🖥️ Executing JavaScript & Extracting Structured Data without LLMs</summary>
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+import json
+
+async def main():
+    schema = {
+        "name": "KidoCode Courses",
+        "baseSelector": "section.charge-methodology .w-tab-content > div",
+        "fields": [
+            {
+                "name": "section_title",
+                "selector": "h3.heading-50",
+                "type": "text",
+            },
+            {
+                "name": "section_description",
+                "selector": ".charge-content",
+                "type": "text",
+            },
+            {
+                "name": "course_name",
+                "selector": ".text-block-93",
+                "type": "text",
+            },
+            {
+                "name": "course_description",
+                "selector": ".course-content-text",
+                "type": "text",
+            },
+            {
+                "name": "course_icon",
+                "selector": ".image-92",
+                "type": "attribute",
+                "attribute": "src"
+            }
+        ]
+    }
+
+    extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
+
+    browser_config = BrowserConfig(
+        headless=False,
+        verbose=True
+    )
+    run_config = CrawlerRunConfig(
+        extraction_strategy=extraction_strategy,
+        js_code=["""(async () => {const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");for(let tab of tabs) {tab.scrollIntoView();tab.click();await new Promise(r => setTimeout(r, 500));}})();"""],
+        cache_mode=CacheMode.BYPASS
+    )
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+
+        result = await crawler.arun(
+            url="https://www.kidocode.com/degrees/technology",
+            config=run_config
+        )
+
+        courses = json.loads(result.extracted_content)
+        print(f"Successfully extracted {len(courses)} course entries")
+        print(json.dumps(courses[0], indent=2))
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+ +
+📚 Extracting Structured Data with LLMs + +```python +import os +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode +from crawl4ai.extraction_strategy import LLMExtractionStrategy +from pydantic import BaseModel, Field + +class OpenAIModelFee(BaseModel): + model_name: str = Field(..., description="Name of the OpenAI model.") + input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") + output_fee: str = Field(..., description="Fee for output token for the OpenAI model.") + +async def main(): + browser_config = BrowserConfig(verbose=True) + run_config = CrawlerRunConfig( + word_count_threshold=1, + extraction_strategy=LLMExtractionStrategy( + # Here you can use any provider that Litellm library supports, for instance: ollama/qwen2 + # provider="ollama/qwen2", api_token="no-token", + provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'), + schema=OpenAIModelFee.schema(), + extraction_type="schema", + instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. + Do not miss any models in the entire content. One extracted model JSON format should look like this: + {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""" + ), + cache_mode=CacheMode.BYPASS, + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url='https://openai.com/api/pricing/', + config=run_config + ) + print(result.extracted_content) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +
+ +
+<details>
+<summary>🤖 Using Your Own Browser with a Custom User Profile</summary>
+
+```python
+import os
+from pathlib import Path
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+
+async def test_news_crawl():
+    # Create a persistent user data directory
+    user_data_dir = os.path.join(Path.home(), ".crawl4ai", "browser_profile")
+    os.makedirs(user_data_dir, exist_ok=True)
+
+    browser_config = BrowserConfig(
+        verbose=True,
+        headless=True,
+        user_data_dir=user_data_dir,
+        use_persistent_context=True,
+    )
+    run_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS
+    )
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        url = "ADDRESS_OF_A_CHALLENGING_WEBSITE"
+
+        result = await crawler.arun(
+            url,
+            config=run_config,
+            magic=True,
+        )
+
+        print(f"Successfully crawled {url}")
+        print(f"Content length: {len(result.markdown)}")
+```
+ + +## ✨ Recent Updates + +- 🔒 **Enhanced SSL & Security**: New SSL certificate handling with custom paths and validation options for secure crawling +- 🔍 **Smart Content Filtering**: Advanced filtering system with regex support and efficient chunking strategies +- 📦 **Improved JSON Extraction**: Support for complex JSONPath, JSON-CSS, and Microdata extraction +- 🏗️ **New Field Types**: Added `computed`, `conditional`, `aggregate`, and `template` field types +- ⚡ **Performance Boost**: Optimized caching, parallel processing, and memory management +- 🐛 **Better Error Handling**: Enhanced debugging capabilities with detailed error tracking +- 🔐 **Security Features**: Improved input validation and safe expression evaluation + +Read the full details of this release in our [0.4.24 Release Notes](https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md). + +## 📖 Documentation & Roadmap + +> 🚨 **Documentation Update Alert**: We're undertaking a major documentation overhaul next week to reflect recent updates and improvements. Stay tuned for a more comprehensive and up-to-date guide! + +For current documentation, including installation instructions, advanced features, and API reference, visit our [Documentation Website](https://crawl4ai.com/mkdocs/). + +To check our development plans and upcoming features, visit our [Roadmap](https://github.com/unclecode/crawl4ai/blob/main/ROADMAP.md). + +
+📈 Development TODOs + +- [x] 0. Graph Crawler: Smart website traversal using graph search algorithms for comprehensive nested page extraction +- [ ] 1. Question-Based Crawler: Natural language driven web discovery and content extraction +- [ ] 2. Knowledge-Optimal Crawler: Smart crawling that maximizes knowledge while minimizing data extraction +- [ ] 3. Agentic Crawler: Autonomous system for complex multi-step crawling operations +- [ ] 4. Automated Schema Generator: Convert natural language to extraction schemas +- [ ] 5. Domain-Specific Scrapers: Pre-configured extractors for common platforms (academic, e-commerce) +- [ ] 6. Web Embedding Index: Semantic search infrastructure for crawled content +- [ ] 7. Interactive Playground: Web UI for testing, comparing strategies with AI assistance +- [ ] 8. Performance Monitor: Real-time insights into crawler operations +- [ ] 9. Cloud Integration: One-click deployment solutions across cloud providers +- [ ] 10. Sponsorship Program: Structured support system with tiered benefits +- [ ] 11. Educational Content: "How to Crawl" video series and interactive tutorials + +
+ +## 🤝 Contributing + +We welcome contributions from the open-source community. Check out our [contribution guidelines](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTING.md) for more information. + +## 📄 License + +Crawl4AI is released under the [Apache 2.0 License](https://github.com/unclecode/crawl4ai/blob/main/LICENSE). + +## 📧 Contact + +For questions, suggestions, or feedback, feel free to reach out: + +- GitHub: [unclecode](https://github.com/unclecode) +- Twitter: [@unclecode](https://twitter.com/unclecode) +- Website: [crawl4ai.com](https://crawl4ai.com) + +Happy Crawling! 🕸️🚀 + +## 🗾 Mission + +Our mission is to unlock the value of personal and enterprise data by transforming digital footprints into structured, tradeable assets. Crawl4AI empowers individuals and organizations with open-source tools to extract and structure data, fostering a shared data economy. + +We envision a future where AI is powered by real human knowledge, ensuring data creators directly benefit from their contributions. By democratizing data and enabling ethical sharing, we are laying the foundation for authentic AI advancement. + +
+🔑 Key Opportunities + +- **Data Capitalization**: Transform digital footprints into measurable, valuable assets. +- **Authentic AI Data**: Provide AI systems with real human insights. +- **Shared Economy**: Create a fair data marketplace that benefits data creators. + +
+ +
+🚀 Development Pathway + +1. **Open-Source Tools**: Community-driven platforms for transparent data extraction. +2. **Digital Asset Structuring**: Tools to organize and value digital knowledge. +3. **Ethical Data Marketplace**: A secure, fair platform for exchanging structured data. + +For more details, see our [full mission statement](./MISSION.md). +
+ +## Star History + +[![Star History Chart](https://api.star-history.com/svg?repos=unclecode/crawl4ai&type=Date)](https://star-history.com/#unclecode/crawl4ai&Date) diff --git a/ROADMAP.md b/ROADMAP.md new file mode 100644 index 0000000000000000000000000000000000000000..0fd784c13d75c008fa5671ef3bdf1553795158dd --- /dev/null +++ b/ROADMAP.md @@ -0,0 +1,503 @@ +# Crawl4AI Strategic Roadmap + +```mermaid +%%{init: {'themeVariables': { 'fontSize': '14px'}}}%% +graph TD + subgraph A1[Advanced Crawling Systems 🔧] + A["` + • Graph Crawler ✓ + • Question-Based Crawler + • Knowledge-Optimal Crawler + • Agentic Crawler + `"] + end + + subgraph A2[Specialized Features 🛠️] + B["` + • Automated Schema Generator + • Domain-Specific Scrapers + • + • + `"] + end + + subgraph A3[Development Tools 🔨] + C["` + • Interactive Playground + • Performance Monitor + • Cloud Integration + • + `"] + end + + subgraph A4[Community & Growth 🌱] + D["` + • Sponsorship Program + • Educational Content + • + • + `"] + end + + classDef default fill:#f9f9f9,stroke:#333,stroke-width:2px + classDef section fill:#f0f0f0,stroke:#333,stroke-width:4px,rx:10 + class A1,A2,A3,A4 section + + %% Layout hints + A1 --> A2[" "] + A3 --> A4[" "] + linkStyle 0,1 stroke:none +``` + +Crawl4AI is evolving to provide more intelligent, efficient, and versatile web crawling capabilities. This roadmap outlines the key developments and features planned for the project, organized into strategic sections that build upon our current foundation. + +## 1. Advanced Crawling Systems 🔧 + +This section introduces three powerful crawling systems that extend Crawl4AI's capabilities from basic web crawling to intelligent, purpose-driven data extraction. + +### 1.1 Question-Based Crawler +The Question-Based Crawler enhances our core engine by enabling automatic discovery and extraction of relevant web content based on natural language questions. + +Key Features: +- SerpiAPI integration for intelligent web search +- Relevancy scoring for search results +- Automatic URL discovery and prioritization +- Cross-source validation + +```python +from crawl4ai import AsyncWebCrawler +from crawl4ai.discovery import QuestionBasedDiscovery + +async with AsyncWebCrawler() as crawler: + discovery = QuestionBasedDiscovery(crawler) + results = await discovery.arun( + question="What are the system requirements for major cloud providers' GPU instances?", + max_urls=5, + relevance_threshold=0.7 + ) + + for result in results: + print(f"Source: {result.url} (Relevance: {result.relevance_score})") + print(f"Content: {result.markdown}\n") +``` + +### 1.2 Knowledge-Optimal Crawler +An intelligent crawling system that solves the optimization problem of minimizing data extraction while maximizing knowledge acquisition for specific objectives. 
+ +Key Features: +- Smart content prioritization +- Minimal data extraction for maximum knowledge +- Probabilistic relevance assessment +- Objective-driven crawling paths + +```python +from crawl4ai import AsyncWebCrawler +from crawl4ai.optimization import KnowledgeOptimizer + +async with AsyncWebCrawler() as crawler: + optimizer = KnowledgeOptimizer( + objective="Understand GPU instance pricing and limitations across cloud providers", + required_knowledge=[ + "pricing structure", + "GPU specifications", + "usage limits", + "availability zones" + ], + confidence_threshold=0.85 + ) + + result = await crawler.arun( + urls=[ + "https://aws.amazon.com/ec2/pricing/", + "https://cloud.google.com/gpu", + "https://azure.microsoft.com/pricing/" + ], + optimizer=optimizer, + optimization_mode="minimal_extraction" + ) + + print(f"Knowledge Coverage: {result.knowledge_coverage}") + print(f"Data Efficiency: {result.efficiency_ratio}") + print(f"Extracted Content: {result.optimal_content}") +``` + +### 1.3 Agentic Crawler +An autonomous system capable of understanding complex goals and automatically planning and executing multi-step crawling operations. + +Key Features: +- Autonomous goal interpretation +- Dynamic step planning +- Interactive navigation capabilities +- Visual recognition and interaction +- Automatic error recovery + +```python +from crawl4ai import AsyncWebCrawler +from crawl4ai.agents import CrawlerAgent + +async with AsyncWebCrawler() as crawler: + agent = CrawlerAgent(crawler) + + # Automatic planning and execution + result = await agent.arun( + goal="Find research papers about quantum computing published in 2023 with more than 50 citations", + auto_retry=True + ) + print("Generated Plan:", result.executed_steps) + print("Extracted Data:", result.data) + + # Using custom steps with automatic execution + result = await agent.arun( + goal="Extract conference deadlines from ML conferences", + custom_plan=[ + "Navigate to conference page", + "Find important dates section", + "Extract submission deadlines", + "Verify dates are for 2024" + ] + ) + + # Monitoring execution + print("Step Completion:", result.step_status) + print("Execution Time:", result.execution_time) + print("Success Rate:", result.success_rate) +``` + +# Section 2: Specialized Features 🛠️ + +This section introduces specialized tools and features that enhance Crawl4AI's capabilities for specific use cases and data extraction needs. + +### 2.1 Automated Schema Generator +A system that automatically generates JsonCssExtractionStrategy schemas from natural language descriptions, making structured data extraction accessible to all users. 
+ +Key Features: +- Natural language schema generation +- Automatic pattern detection +- Predefined schema templates +- Chrome extension for visual schema building + +```python +from crawl4ai import AsyncWebCrawler +from crawl4ai.schema import SchemaGenerator + +# Generate schema from natural language description +generator = SchemaGenerator() +schema = await generator.generate( + url="https://news-website.com", + description="For each news article on the page, I need the headline, publication date, and main image" +) + +# Use generated schema with crawler +async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://news-website.com", + extraction_strategy=schema + ) + +# Example of generated schema: +""" +{ + "name": "News Article Extractor", + "baseSelector": "article.news-item", + "fields": [ + { + "name": "headline", + "selector": "h2.article-title", + "type": "text" + }, + { + "name": "date", + "selector": "span.publish-date", + "type": "text" + }, + { + "name": "image", + "selector": "img.article-image", + "type": "attribute", + "attribute": "src" + } + ] +} +""" +``` + +### 2.2 Domain Specific Scrapers +Specialized extraction strategies optimized for common website types and platforms, providing consistent and reliable data extraction without additional configuration. + +Key Features: +- Pre-configured extractors for popular platforms +- Academic site specialization (arXiv, NCBI) +- E-commerce standardization +- Documentation site handling + +```python +from crawl4ai import AsyncWebCrawler +from crawl4ai.extractors import AcademicExtractor, EcommerceExtractor + +async with AsyncWebCrawler() as crawler: + # Academic paper extraction + papers = await crawler.arun( + url="https://arxiv.org/list/cs.AI/recent", + extractor="academic", # Built-in extractor type + site_type="arxiv", # Specific site optimization + extract_fields=[ + "title", + "authors", + "abstract", + "citations" + ] + ) + + # E-commerce product data + products = await crawler.arun( + url="https://store.example.com/products", + extractor="ecommerce", + extract_fields=[ + "name", + "price", + "availability", + "reviews" + ] + ) +``` + +### 2.3 Web Embedding Index +Creates and maintains a semantic search infrastructure for crawled content, enabling efficient retrieval and querying of web content through vector embeddings. + +Key Features: +- Automatic embedding generation +- Intelligent content chunking +- Efficient vector storage and indexing +- Semantic search capabilities + +```python +from crawl4ai import AsyncWebCrawler +from crawl4ai.indexing import WebIndex + +# Initialize and build index +index = WebIndex(model="efficient-mini") + +async with AsyncWebCrawler() as crawler: + # Crawl and index content + await index.build( + urls=["https://docs.example.com"], + crawler=crawler, + options={ + "chunk_method": "semantic", + "update_policy": "incremental", + "embedding_batch_size": 100 + } + ) + + # Search through indexed content + results = await index.search( + query="How to implement OAuth authentication?", + filters={ + "content_type": "technical", + "recency": "6months" + }, + top_k=5 + ) + + # Get similar content + similar = await index.find_similar( + url="https://docs.example.com/auth/oauth", + threshold=0.85 + ) +``` + +Each of these specialized features builds upon Crawl4AI's core functionality while providing targeted solutions for specific use cases. They can be used independently or combined for more complex data extraction and processing needs. 
+ +# Section 3: Development Tools 🔧 + +This section covers tools designed to enhance the development experience, monitoring, and deployment of Crawl4AI applications. + +### 3.1 Crawl4AI Playground 🎮 + +The Crawl4AI Playground is an interactive web-based development environment that simplifies web scraping experimentation, development, and deployment. With its intuitive interface and AI-powered assistance, users can quickly prototype, test, and deploy web scraping solutions. + +#### Key Features 🌟 + +##### Visual Strategy Builder +- Interactive point-and-click interface for building extraction strategies +- Real-time preview of selected elements +- Side-by-side comparison of different extraction approaches +- Visual validation of CSS selectors and XPath queries + +##### AI Assistant Integration +- Strategy recommendations based on target website analysis +- Parameter optimization suggestions +- Best practices guidance for specific use cases +- Automated error detection and resolution +- Performance optimization tips + +##### Real-Time Testing & Validation +- Live preview of extraction results +- Side-by-side comparison of multiple strategies +- Performance metrics visualization +- Automatic validation of extracted data +- Error detection and debugging tools + +##### Project Management +- Save and organize multiple scraping projects +- Version control for configurations +- Export/import project settings +- Share configurations with team members +- Project templates for common use cases + +##### Deployment Pipeline +- One-click deployment to various environments +- Docker container generation +- Cloud deployment templates (AWS, GCP, Azure) +- Scaling configuration management +- Monitoring setup automation + + +### 3.2 Performance Monitoring System +A comprehensive monitoring solution providing real-time insights into crawler operations, resource usage, and system health through both CLI and GUI interfaces. + +Key Features: +- Real-time resource tracking +- Active crawl monitoring +- Performance statistics +- Customizable alerting system + +```python +from crawl4ai import AsyncWebCrawler +from crawl4ai.monitor import CrawlMonitor + +# Initialize monitoring +monitor = CrawlMonitor() + +# Start monitoring with CLI interface +await monitor.start( + mode="cli", # or "gui" + refresh_rate="1s", + metrics={ + "resources": ["cpu", "memory", "network"], + "crawls": ["active", "queued", "completed"], + "performance": ["success_rate", "response_times"] + } +) + +# Example CLI output: +""" +Crawl4AI Monitor (Live) - Press Q to exit +──────────────────────────────────────── +System Usage: + ├─ CPU: ███████░░░ 70% + └─ Memory: ████░░░░░ 2.1GB/8GB + +Active Crawls: +ID URL Status Progress +001 docs.example.com 🟢 Active 75% +002 api.service.com 🟡 Queue - + +Metrics (Last 5min): + ├─ Success Rate: 98% + ├─ Avg Response: 0.6s + └─ Pages/sec: 8.5 +""" +``` + +### 3.3 Cloud Integration +Streamlined deployment tools for setting up Crawl4AI in various cloud environments, with support for scaling and monitoring. 
+ +Key Features: +- One-click deployment solutions +- Auto-scaling configuration +- Load balancing setup +- Cloud-specific optimizations +- Monitoring integration + +```python +from crawl4ai import AsyncWebCrawler +from crawl4ai.deploy import CloudDeployer + +# Initialize deployer +deployer = CloudDeployer() + +# Deploy crawler service +deployment = await deployer.deploy( + service_name="crawler-cluster", + platform="aws", # or "gcp", "azure" + config={ + "instance_type": "compute-optimized", + "auto_scaling": { + "min_instances": 2, + "max_instances": 10, + "scale_based_on": "cpu_usage" + }, + "region": "us-east-1", + "monitoring": True + } +) + +# Get deployment status and endpoints +print(f"Service Status: {deployment.status}") +print(f"API Endpoint: {deployment.endpoint}") +print(f"Monitor URL: {deployment.monitor_url}") +``` + +These development tools work together to provide a comprehensive environment for developing, testing, monitoring, and deploying Crawl4AI applications. The Playground helps users experiment and generate optimal configurations, the Performance Monitor ensures smooth operation, and the Cloud Integration tools simplify deployment and scaling. + +# Section 4: Community & Growth 🌱 + +This section outlines initiatives designed to build and support the Crawl4AI community, provide educational resources, and ensure sustainable project growth. + +### 4.1 Sponsorship Program +A structured program to support ongoing development and maintenance of Crawl4AI while providing valuable benefits to sponsors. + +Key Features: +- Multiple sponsorship tiers +- Sponsor recognition system +- Priority support for sponsors +- Early access to new features +- Custom feature development opportunities + +Program Structure (not yet finalized): +``` +Sponsorship Tiers: + +🥉 Bronze Supporter +- GitHub Sponsor badge +- Priority issue response +- Community Discord role + +🥈 Silver Supporter +- All Bronze benefits +- Technical support channel +- Vote on roadmap priorities +- Early access to beta features + +🥇 Gold Supporter +- All Silver benefits +- Custom feature requests +- Direct developer access +- Private support sessions + +💎 Diamond Partner +- All Gold benefits +- Custom development +- On-demand consulting +- Integration support +``` + +### 4.2 "How to Crawl" Video Series +A comprehensive educational resource teaching users how to effectively use Crawl4AI for various web scraping and data extraction scenarios. + +Key Features: +- Step-by-step tutorials +- Real-world use cases +- Best practices +- Integration guides +- Advanced feature deep-dives + +These community initiatives are designed to: +- Provide comprehensive learning resources +- Foster a supportive user community +- Ensure sustainable project development +- Share knowledge and best practices +- Create opportunities for collaboration + +The combination of structured support through sponsorship, educational content through video series, and interactive learning through the playground creates a robust ecosystem for both new and experienced users of Crawl4AI. 
diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d297dfca3731fe5c511deb5b82af496380a81bf0
--- /dev/null
+++ b/crawl4ai/__init__.py
@@ -0,0 +1,46 @@
+# __init__.py
+
+from .async_webcrawler import AsyncWebCrawler, CacheMode
+from .async_configs import BrowserConfig, CrawlerRunConfig
+from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy, CosineStrategy, JsonCssExtractionStrategy
+from .chunking_strategy import ChunkingStrategy, RegexChunking
+from .markdown_generation_strategy import DefaultMarkdownGenerator
+from .content_filter_strategy import PruningContentFilter, BM25ContentFilter
+from .models import CrawlResult
+from .__version__ import __version__
+
+__all__ = [
+    "AsyncWebCrawler",
+    "CrawlResult",
+    "CacheMode",
+    "BrowserConfig",
+    "CrawlerRunConfig",
+    "ExtractionStrategy",
+    "LLMExtractionStrategy",
+    "CosineStrategy",
+    "JsonCssExtractionStrategy",
+    "ChunkingStrategy",
+    "RegexChunking",
+    "DefaultMarkdownGenerator",
+    "PruningContentFilter",
+    "BM25ContentFilter",
+]
+
+def is_sync_version_installed():
+    try:
+        import selenium
+        return True
+    except ImportError:
+        return False
+
+if is_sync_version_installed():
+    try:
+        from .web_crawler import WebCrawler
+        __all__.append("WebCrawler")
+    except ImportError:
+        import warnings
+        warnings.warn("Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies.")
+else:
+    WebCrawler = None
+    # import warnings
+    # print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
\ No newline at end of file
diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ec3d0534916e11f58ed3d8394575ae1c201d389
--- /dev/null
+++ b/crawl4ai/__version__.py
@@ -0,0 +1,2 @@
+# crawl4ai/__version__.py
+__version__ = "0.4.247"
diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4de071f2fda87fca0ffede64a6951fe85198679
--- /dev/null
+++ b/crawl4ai/async_configs.py
@@ -0,0 +1,603 @@
+from .config import (
+    MIN_WORD_THRESHOLD,
+    IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
+    SCREENSHOT_HEIGHT_TRESHOLD,
+    PAGE_TIMEOUT,
+    IMAGE_SCORE_THRESHOLD,
+    SOCIAL_MEDIA_DOMAINS,
+
+)
+from .user_agent_generator import UserAgentGenerator
+from .extraction_strategy import ExtractionStrategy
+from .chunking_strategy import ChunkingStrategy
+from .markdown_generation_strategy import MarkdownGenerationStrategy
+from typing import Union, List
+
+
+class BrowserConfig:
+    """
+    Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy.
+
+    This class centralizes all parameters that affect browser and context creation. Instead of passing
+    scattered keyword arguments, users can instantiate and modify this configuration object. The crawler
+    code will then reference these settings to initialize the browser in a consistent, documented manner.
+
+    Attributes:
+        browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit".
+                            Default: "chromium".
+        headless (bool): Whether to run the browser in headless mode (no visible GUI).
+                         Default: True.
+        use_managed_browser (bool): Launch the browser using a managed approach (e.g., via CDP), allowing
+                                    advanced manipulation. Default: False.
+ debugging_port (int): Port for the browser debugging protocol. Default: 9222. + use_persistent_context (bool): Use a persistent browser context (like a persistent profile). + Automatically sets use_managed_browser=True. Default: False. + user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a + temporary directory may be used. Default: None. + chrome_channel (str): The Chrome channel to launch (e.g., "chrome", "msedge"). Only applies if browser_type + is "chromium". Default: "chromium". + channel (str): The channel to launch (e.g., "chromium", "chrome", "msedge"). Only applies if browser_type + is "chromium". Default: "chromium". + proxy (str or None): Proxy server URL (e.g., "http://username:password@proxy:port"). If None, no proxy is used. + Default: None. + proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}. + If None, no additional proxy config. Default: None. + viewport_width (int): Default viewport width for pages. Default: 1080. + viewport_height (int): Default viewport height for pages. Default: 600. + verbose (bool): Enable verbose logging. + Default: True. + accept_downloads (bool): Whether to allow file downloads. If True, requires a downloads_path. + Default: False. + downloads_path (str or None): Directory to store downloaded files. If None and accept_downloads is True, + a default path will be created. Default: None. + storage_state (str or dict or None): Path or object describing storage state (cookies, localStorage). + Default: None. + ignore_https_errors (bool): Ignore HTTPS certificate errors. Default: True. + java_script_enabled (bool): Enable JavaScript execution in pages. Default: True. + cookies (list): List of cookies to add to the browser context. Each cookie is a dict with fields like + {"name": "...", "value": "...", "url": "..."}. + Default: []. + headers (dict): Extra HTTP headers to apply to all requests in this context. + Default: {}. + user_agent (str): Custom User-Agent string to use. Default: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36". + user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use the provided + user_agent as-is. Default: None. + user_agent_generator_config (dict or None): Configuration for user agent generation if user_agent_mode is set. + Default: None. + text_mode (bool): If True, disables images and other rich content for potentially faster load times. + Default: False. + light_mode (bool): Disables certain background features for performance gains. Default: False. + extra_args (list): Additional command-line arguments passed to the browser. + Default: []. 
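+
+    Example:
+        Illustrative construction (a minimal sketch; the parameter values below are
+        arbitrary and only use attributes documented above):
+
+            config = BrowserConfig(
+                browser_type="chromium",
+                headless=True,
+                viewport_width=1280,
+                viewport_height=720,
+                user_agent_mode="random",
+                extra_args=["--disable-extensions"],
+            )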
+ """ + + def __init__( + self, + browser_type: str = "chromium", + headless: bool = True, + use_managed_browser: bool = False, + use_persistent_context: bool = False, + user_data_dir: str = None, + chrome_channel: str = "chromium", + channel: str = "chromium", + proxy: str = None, + proxy_config: dict = None, + viewport_width: int = 1080, + viewport_height: int = 600, + accept_downloads: bool = False, + downloads_path: str = None, + storage_state=None, + ignore_https_errors: bool = True, + java_script_enabled: bool = True, + sleep_on_close: bool = False, + verbose: bool = True, + cookies: list = None, + headers: dict = None, + user_agent: str = ( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47" + ), + user_agent_mode: str = None, + user_agent_generator_config: dict = None, + text_mode: bool = False, + light_mode: bool = False, + extra_args: list = None, + debugging_port : int = 9222, + ): + self.browser_type = browser_type + self.headless = headless + self.use_managed_browser = use_managed_browser + self.use_persistent_context = use_persistent_context + self.user_data_dir = user_data_dir + self.chrome_channel = chrome_channel or self.browser_type or "chromium" + self.channel = channel or self.browser_type or "chromium" + self.proxy = proxy + self.proxy_config = proxy_config + self.viewport_width = viewport_width + self.viewport_height = viewport_height + self.accept_downloads = accept_downloads + self.downloads_path = downloads_path + self.storage_state = storage_state + self.ignore_https_errors = ignore_https_errors + self.java_script_enabled = java_script_enabled + self.cookies = cookies if cookies is not None else [] + self.headers = headers if headers is not None else {} + self.user_agent = user_agent + self.user_agent_mode = user_agent_mode + self.user_agent_generator_config = user_agent_generator_config + self.text_mode = text_mode + self.light_mode = light_mode + self.extra_args = extra_args if extra_args is not None else [] + self.sleep_on_close = sleep_on_close + self.verbose = verbose + self.debugging_port = debugging_port + + user_agenr_generator = UserAgentGenerator() + if self.user_agent_mode != "random" and self.user_agent_generator_config: + self.user_agent = user_agenr_generator.generate( + **(self.user_agent_generator_config or {}) + ) + elif self.user_agent_mode == "random": + self.user_agent = user_agenr_generator.generate() + else: + pass + + self.browser_hint = user_agenr_generator.generate_client_hints(self.user_agent) + self.headers.setdefault("sec-ch-ua", self.browser_hint) + + # If persistent context is requested, ensure managed browser is enabled + if self.use_persistent_context: + self.use_managed_browser = True + + @staticmethod + def from_kwargs(kwargs: dict) -> "BrowserConfig": + return BrowserConfig( + browser_type=kwargs.get("browser_type", "chromium"), + headless=kwargs.get("headless", True), + use_managed_browser=kwargs.get("use_managed_browser", False), + use_persistent_context=kwargs.get("use_persistent_context", False), + user_data_dir=kwargs.get("user_data_dir"), + chrome_channel=kwargs.get("chrome_channel", "chromium"), + channel=kwargs.get("channel", "chromium"), + proxy=kwargs.get("proxy"), + proxy_config=kwargs.get("proxy_config"), + viewport_width=kwargs.get("viewport_width", 1080), + viewport_height=kwargs.get("viewport_height", 600), + accept_downloads=kwargs.get("accept_downloads", False), + downloads_path=kwargs.get("downloads_path"), + 
            storage_state=kwargs.get("storage_state"),
+            ignore_https_errors=kwargs.get("ignore_https_errors", True),
+            java_script_enabled=kwargs.get("java_script_enabled", True),
+            cookies=kwargs.get("cookies", []),
+            headers=kwargs.get("headers", {}),
+            user_agent=kwargs.get(
+                "user_agent",
+                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
+            ),
+            user_agent_mode=kwargs.get("user_agent_mode"),
+            user_agent_generator_config=kwargs.get("user_agent_generator_config"),
+            text_mode=kwargs.get("text_mode", False),
+            light_mode=kwargs.get("light_mode", False),
+            extra_args=kwargs.get("extra_args", []),
+        )
+
+
+class CrawlerRunConfig:
+    """
+    Configuration class for controlling how the crawler runs each crawl operation.
+    This includes parameters for content extraction, page manipulation, waiting conditions,
+    caching, and other runtime behaviors.
+
+    This centralizes parameters that were previously scattered as kwargs to `arun()` and related methods.
+    By using this class, you have a single place to understand and adjust the crawling options.
+
+    Attributes:
+        # Content Processing Parameters
+        word_count_threshold (int): Minimum word count threshold before processing content.
+                                    Default: MIN_WORD_THRESHOLD (typically 200).
+        extraction_strategy (ExtractionStrategy or None): Strategy to extract structured data from crawled pages.
+                                                          Default: None (NoExtractionStrategy is used if None).
+        chunking_strategy (ChunkingStrategy): Strategy to chunk content before extraction.
+                                              Default: RegexChunking().
+        markdown_generator (MarkdownGenerationStrategy): Strategy for generating markdown.
+                                                         Default: None.
+        content_filter (RelevantContentFilter or None): Optional filter to prune irrelevant content.
+                                                        Default: None.
+        only_text (bool): If True, attempt to extract text-only content where applicable.
+                          Default: False.
+        css_selector (str or None): CSS selector to extract a specific portion of the page.
+                                    Default: None.
+        excluded_tags (list of str or None): List of HTML tags to exclude from processing.
+                                             Default: None.
+        excluded_selector (str or None): CSS selector to exclude from processing.
+                                         Default: None.
+        keep_data_attributes (bool): If True, retain `data-*` attributes while removing unwanted attributes.
+                                     Default: False.
+        remove_forms (bool): If True, remove all `<form>
` elements from the HTML. + Default: False. + prettiify (bool): If True, apply `fast_format_html` to produce prettified HTML output. + Default: False. + parser_type (str): Type of parser to use for HTML parsing. + Default: "lxml". + + # Caching Parameters + cache_mode (CacheMode or None): Defines how caching is handled. + If None, defaults to CacheMode.ENABLED internally. + Default: None. + session_id (str or None): Optional session ID to persist the browser context and the created + page instance. If the ID already exists, the crawler does not + create a new page and uses the current page to preserve the state. + bypass_cache (bool): Legacy parameter, if True acts like CacheMode.BYPASS. + Default: False. + disable_cache (bool): Legacy parameter, if True acts like CacheMode.DISABLED. + Default: False. + no_cache_read (bool): Legacy parameter, if True acts like CacheMode.WRITE_ONLY. + Default: False. + no_cache_write (bool): Legacy parameter, if True acts like CacheMode.READ_ONLY. + Default: False. + + # Page Navigation and Timing Parameters + wait_until (str): The condition to wait for when navigating, e.g. "domcontentloaded". + Default: "domcontentloaded". + page_timeout (int): Timeout in ms for page operations like navigation. + Default: 60000 (60 seconds). + wait_for (str or None): A CSS selector or JS condition to wait for before extracting content. + Default: None. + wait_for_images (bool): If True, wait for images to load before extracting content. + Default: False. + delay_before_return_html (float): Delay in seconds before retrieving final HTML. + Default: 0.1. + mean_delay (float): Mean base delay between requests when calling arun_many. + Default: 0.1. + max_range (float): Max random additional delay range for requests in arun_many. + Default: 0.3. + semaphore_count (int): Number of concurrent operations allowed. + Default: 5. + + # Page Interaction Parameters + js_code (str or list of str or None): JavaScript code/snippets to run on the page. + Default: None. + js_only (bool): If True, indicates subsequent calls are JS-driven updates, not full page loads. + Default: False. + ignore_body_visibility (bool): If True, ignore whether the body is visible before proceeding. + Default: True. + scan_full_page (bool): If True, scroll through the entire page to load all content. + Default: False. + scroll_delay (float): Delay in seconds between scroll steps if scan_full_page is True. + Default: 0.2. + process_iframes (bool): If True, attempts to process and inline iframe content. + Default: False. + remove_overlay_elements (bool): If True, remove overlays/popups before extracting HTML. + Default: False. + simulate_user (bool): If True, simulate user interactions (mouse moves, clicks) for anti-bot measures. + Default: False. + override_navigator (bool): If True, overrides navigator properties for more human-like behavior. + Default: False. + magic (bool): If True, attempts automatic handling of overlays/popups. + Default: False. + adjust_viewport_to_content (bool): If True, adjust viewport according to the page content dimensions. + Default: False. + + # Media Handling Parameters + screenshot (bool): Whether to take a screenshot after crawling. + Default: False. + screenshot_wait_for (float or None): Additional wait time before taking a screenshot. + Default: None. + screenshot_height_threshold (int): Threshold for page height to decide screenshot strategy. + Default: SCREENSHOT_HEIGHT_TRESHOLD (from config, e.g. 20000). + pdf (bool): Whether to generate a PDF of the page. + Default: False. 
+ image_description_min_word_threshold (int): Minimum words for image description extraction. + Default: IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD (e.g., 50). + image_score_threshold (int): Minimum score threshold for processing an image. + Default: IMAGE_SCORE_THRESHOLD (e.g., 3). + exclude_external_images (bool): If True, exclude all external images from processing. + Default: False. + + # Link and Domain Handling Parameters + exclude_social_media_domains (list of str): List of domains to exclude for social media links. + Default: SOCIAL_MEDIA_DOMAINS (from config). + exclude_external_links (bool): If True, exclude all external links from the results. + Default: False. + exclude_social_media_links (bool): If True, exclude links pointing to social media domains. + Default: False. + exclude_domains (list of str): List of specific domains to exclude from results. + Default: []. + + # Debugging and Logging Parameters + verbose (bool): Enable verbose logging. + Default: True. + log_console (bool): If True, log console messages from the page. + Default: False. + """ + + def __init__( + self, + # Content Processing Parameters + word_count_threshold: int = MIN_WORD_THRESHOLD, + extraction_strategy: ExtractionStrategy = None, + chunking_strategy: ChunkingStrategy = None, + markdown_generator: MarkdownGenerationStrategy = None, + content_filter=None, + only_text: bool = False, + css_selector: str = None, + excluded_tags: list = None, + excluded_selector: str = None, + keep_data_attributes: bool = False, + remove_forms: bool = False, + prettiify: bool = False, + parser_type: str = "lxml", + + # SSL Parameters + fetch_ssl_certificate: bool = False, + + # Caching Parameters + cache_mode=None, + session_id: str = None, + bypass_cache: bool = False, + disable_cache: bool = False, + no_cache_read: bool = False, + no_cache_write: bool = False, + + # Page Navigation and Timing Parameters + wait_until: str = "domcontentloaded", + page_timeout: int = PAGE_TIMEOUT, + wait_for: str = None, + wait_for_images: bool = False, + delay_before_return_html: float = 0.1, + mean_delay: float = 0.1, + max_range: float = 0.3, + semaphore_count: int = 5, + + # Page Interaction Parameters + js_code: Union[str, List[str]] = None, + js_only: bool = False, + ignore_body_visibility: bool = True, + scan_full_page: bool = False, + scroll_delay: float = 0.2, + process_iframes: bool = False, + remove_overlay_elements: bool = False, + simulate_user: bool = False, + override_navigator: bool = False, + magic: bool = False, + adjust_viewport_to_content: bool = False, + + # Media Handling Parameters + screenshot: bool = False, + screenshot_wait_for: float = None, + screenshot_height_threshold: int = SCREENSHOT_HEIGHT_TRESHOLD, + pdf: bool = False, + image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, + image_score_threshold: int = IMAGE_SCORE_THRESHOLD, + exclude_external_images: bool = False, + + # Link and Domain Handling Parameters + exclude_social_media_domains: list = None, + exclude_external_links: bool = False, + exclude_social_media_links: bool = False, + exclude_domains: list = None, + + # Debugging and Logging Parameters + verbose: bool = True, + log_console: bool = False, + + url: str = None, + ): + self.url = url + + # Content Processing Parameters + self.word_count_threshold = word_count_threshold + self.extraction_strategy = extraction_strategy + self.chunking_strategy = chunking_strategy + self.markdown_generator = markdown_generator + self.content_filter = content_filter + self.only_text = 
only_text + self.css_selector = css_selector + self.excluded_tags = excluded_tags or [] + self.excluded_selector = excluded_selector or "" + self.keep_data_attributes = keep_data_attributes + self.remove_forms = remove_forms + self.prettiify = prettiify + self.parser_type = parser_type + + # SSL Parameters + self.fetch_ssl_certificate = fetch_ssl_certificate + + # Caching Parameters + self.cache_mode = cache_mode + self.session_id = session_id + self.bypass_cache = bypass_cache + self.disable_cache = disable_cache + self.no_cache_read = no_cache_read + self.no_cache_write = no_cache_write + + # Page Navigation and Timing Parameters + self.wait_until = wait_until + self.page_timeout = page_timeout + self.wait_for = wait_for + self.wait_for_images = wait_for_images + self.delay_before_return_html = delay_before_return_html + self.mean_delay = mean_delay + self.max_range = max_range + self.semaphore_count = semaphore_count + + # Page Interaction Parameters + self.js_code = js_code + self.js_only = js_only + self.ignore_body_visibility = ignore_body_visibility + self.scan_full_page = scan_full_page + self.scroll_delay = scroll_delay + self.process_iframes = process_iframes + self.remove_overlay_elements = remove_overlay_elements + self.simulate_user = simulate_user + self.override_navigator = override_navigator + self.magic = magic + self.adjust_viewport_to_content = adjust_viewport_to_content + + # Media Handling Parameters + self.screenshot = screenshot + self.screenshot_wait_for = screenshot_wait_for + self.screenshot_height_threshold = screenshot_height_threshold + self.pdf = pdf + self.image_description_min_word_threshold = image_description_min_word_threshold + self.image_score_threshold = image_score_threshold + self.exclude_external_images = exclude_external_images + + # Link and Domain Handling Parameters + self.exclude_social_media_domains = exclude_social_media_domains or SOCIAL_MEDIA_DOMAINS + self.exclude_external_links = exclude_external_links + self.exclude_social_media_links = exclude_social_media_links + self.exclude_domains = exclude_domains or [] + + # Debugging and Logging Parameters + self.verbose = verbose + self.log_console = log_console + + # Validate type of extraction strategy and chunking strategy if they are provided + if self.extraction_strategy is not None and not isinstance( + self.extraction_strategy, ExtractionStrategy + ): + raise ValueError("extraction_strategy must be an instance of ExtractionStrategy") + if self.chunking_strategy is not None and not isinstance( + self.chunking_strategy, ChunkingStrategy + ): + raise ValueError("chunking_strategy must be an instance of ChunkingStrategy") + + # Set default chunking strategy if None + if self.chunking_strategy is None: + from .chunking_strategy import RegexChunking + self.chunking_strategy = RegexChunking() + + @staticmethod + def from_kwargs(kwargs: dict) -> "CrawlerRunConfig": + return CrawlerRunConfig( + # Content Processing Parameters + word_count_threshold=kwargs.get("word_count_threshold", 200), + extraction_strategy=kwargs.get("extraction_strategy"), + chunking_strategy=kwargs.get("chunking_strategy"), + markdown_generator=kwargs.get("markdown_generator"), + content_filter=kwargs.get("content_filter"), + only_text=kwargs.get("only_text", False), + css_selector=kwargs.get("css_selector"), + excluded_tags=kwargs.get("excluded_tags", []), + excluded_selector=kwargs.get("excluded_selector", ""), + keep_data_attributes=kwargs.get("keep_data_attributes", False), + remove_forms=kwargs.get("remove_forms", 
False), + prettiify=kwargs.get("prettiify", False), + parser_type=kwargs.get("parser_type", "lxml"), + + # SSL Parameters + fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False), + + # Caching Parameters + cache_mode=kwargs.get("cache_mode"), + session_id=kwargs.get("session_id"), + bypass_cache=kwargs.get("bypass_cache", False), + disable_cache=kwargs.get("disable_cache", False), + no_cache_read=kwargs.get("no_cache_read", False), + no_cache_write=kwargs.get("no_cache_write", False), + + # Page Navigation and Timing Parameters + wait_until=kwargs.get("wait_until", "domcontentloaded"), + page_timeout=kwargs.get("page_timeout", 60000), + wait_for=kwargs.get("wait_for"), + wait_for_images=kwargs.get("wait_for_images", False), + delay_before_return_html=kwargs.get("delay_before_return_html", 0.1), + mean_delay=kwargs.get("mean_delay", 0.1), + max_range=kwargs.get("max_range", 0.3), + semaphore_count=kwargs.get("semaphore_count", 5), + + # Page Interaction Parameters + js_code=kwargs.get("js_code"), + js_only=kwargs.get("js_only", False), + ignore_body_visibility=kwargs.get("ignore_body_visibility", True), + scan_full_page=kwargs.get("scan_full_page", False), + scroll_delay=kwargs.get("scroll_delay", 0.2), + process_iframes=kwargs.get("process_iframes", False), + remove_overlay_elements=kwargs.get("remove_overlay_elements", False), + simulate_user=kwargs.get("simulate_user", False), + override_navigator=kwargs.get("override_navigator", False), + magic=kwargs.get("magic", False), + adjust_viewport_to_content=kwargs.get("adjust_viewport_to_content", False), + + # Media Handling Parameters + screenshot=kwargs.get("screenshot", False), + screenshot_wait_for=kwargs.get("screenshot_wait_for"), + screenshot_height_threshold=kwargs.get("screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD), + pdf=kwargs.get("pdf", False), + image_description_min_word_threshold=kwargs.get("image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD), + image_score_threshold=kwargs.get("image_score_threshold", IMAGE_SCORE_THRESHOLD), + exclude_external_images=kwargs.get("exclude_external_images", False), + + # Link and Domain Handling Parameters + exclude_social_media_domains=kwargs.get("exclude_social_media_domains", SOCIAL_MEDIA_DOMAINS), + exclude_external_links=kwargs.get("exclude_external_links", False), + exclude_social_media_links=kwargs.get("exclude_social_media_links", False), + exclude_domains=kwargs.get("exclude_domains", []), + + # Debugging and Logging Parameters + verbose=kwargs.get("verbose", True), + log_console=kwargs.get("log_console", False), + + url=kwargs.get("url"), + ) + + # Create a funciton returns dict of the object + def to_dict(self): + return { + "word_count_threshold": self.word_count_threshold, + "extraction_strategy": self.extraction_strategy, + "chunking_strategy": self.chunking_strategy, + "markdown_generator": self.markdown_generator, + "content_filter": self.content_filter, + "only_text": self.only_text, + "css_selector": self.css_selector, + "excluded_tags": self.excluded_tags, + "excluded_selector": self.excluded_selector, + "keep_data_attributes": self.keep_data_attributes, + "remove_forms": self.remove_forms, + "prettiify": self.prettiify, + "parser_type": self.parser_type, + "fetch_ssl_certificate": self.fetch_ssl_certificate, + "cache_mode": self.cache_mode, + "session_id": self.session_id, + "bypass_cache": self.bypass_cache, + "disable_cache": self.disable_cache, + "no_cache_read": self.no_cache_read, + "no_cache_write": self.no_cache_write, + 
"wait_until": self.wait_until, + "page_timeout": self.page_timeout, + "wait_for": self.wait_for, + "wait_for_images": self.wait_for_images, + "delay_before_return_html": self.delay_before_return_html, + "mean_delay": self.mean_delay, + "max_range": self.max_range, + "semaphore_count": self.semaphore_count, + "js_code": self.js_code, + "js_only": self.js_only, + "ignore_body_visibility": self.ignore_body_visibility, + "scan_full_page": self.scan_full_page, + "scroll_delay": self.scroll_delay, + "process_iframes": self.process_iframes, + "remove_overlay_elements": self.remove_overlay_elements, + "simulate_user": self.simulate_user, + "override_navigator": self.override_navigator, + "magic": self.magic, + "adjust_viewport_to_content": self.adjust_viewport_to_content, + "screenshot": self.screenshot, + "screenshot_wait_for": self.screenshot_wait_for, + "screenshot_height_threshold": self.screenshot_height_threshold, + "pdf": self.pdf, + "image_description_min_word_threshold": self.image_description_min_word_threshold, + "image_score_threshold": self.image_score_threshold, + "exclude_external_images": self.exclude_external_images, + "exclude_social_media_domains": self.exclude_social_media_domains, + "exclude_external_links": self.exclude_external_links, + "exclude_social_media_links": self.exclude_social_media_links, + "exclude_domains": self.exclude_domains, + "verbose": self.verbose, + "log_console": self.log_console, + "url": self.url, + } diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py new file mode 100644 index 0000000000000000000000000000000000000000..b879413cd659d395f4bd4554a2b037820abc02a4 --- /dev/null +++ b/crawl4ai/async_crawler_strategy.py @@ -0,0 +1,2191 @@ +import asyncio +import base64 +import time +from abc import ABC, abstractmethod +from typing import Callable, Dict, Any, List, Optional, Awaitable, Union +import os, sys, shutil +import tempfile, subprocess +from playwright.async_api import async_playwright, Page, Browser, Error, BrowserContext +from playwright.async_api import TimeoutError as PlaywrightTimeoutError +from io import BytesIO +from PIL import Image, ImageDraw, ImageFont +from pathlib import Path +from playwright.async_api import ProxySettings +from pydantic import BaseModel +import hashlib +import json +import uuid +from .js_snippet import load_js_script +from .models import AsyncCrawlResponse +from .utils import get_error_context +from .user_agent_generator import UserAgentGenerator +from .config import SCREENSHOT_HEIGHT_TRESHOLD, DOWNLOAD_PAGE_TIMEOUT +from .async_configs import BrowserConfig, CrawlerRunConfig +from .async_logger import AsyncLogger +from playwright_stealth import StealthConfig, stealth_async +from .ssl_certificate import SSLCertificate + +stealth_config = StealthConfig( + webdriver=True, + chrome_app=True, + chrome_csi=True, + chrome_load_times=True, + chrome_runtime=True, + navigator_languages=True, + navigator_plugins=True, + navigator_permissions=True, + webgl_vendor=True, + outerdimensions=True, + navigator_hardware_concurrency=True, + media_codecs=True, +) + +BROWSER_DISABLE_OPTIONS = [ + "--disable-background-networking", + "--disable-background-timer-throttling", + "--disable-backgrounding-occluded-windows", + "--disable-breakpad", + "--disable-client-side-phishing-detection", + "--disable-component-extensions-with-background-pages", + "--disable-default-apps", + "--disable-extensions", + "--disable-features=TranslateUI", + "--disable-hang-monitor", + "--disable-ipc-flooding-protection", + 
"--disable-popup-blocking", + "--disable-prompt-on-repost", + "--disable-sync", + "--force-color-profile=srgb", + "--metrics-recording-only", + "--no-first-run", + "--password-store=basic", + "--use-mock-keychain", +] + + +class ManagedBrowser: + """ + Manages the browser process and context. This class allows to connect to the browser using CDP protocol. + + Attributes: + browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit". + Default: "chromium". + user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a + temporary directory may be used. Default: None. + headless (bool): Whether to run the browser in headless mode (no visible GUI). + Default: True. + browser_process (subprocess.Popen): The process object for the browser. + temp_dir (str): Temporary directory for user data if not provided. + debugging_port (int): Port for debugging the browser. + host (str): Host for debugging the browser. + + Methods: + start(): Starts the browser process and returns the CDP endpoint URL. + _get_browser_path(): Returns the browser executable path based on OS and browser type. + _get_browser_args(): Returns browser-specific command line arguments. + _get_user_data_dir(): Returns the user data directory path. + _cleanup(): Terminates the browser process and removes the temporary directory. + """ + + browser_type: str + user_data_dir: str + headless: bool + browser_process: subprocess.Popen + temp_dir: str + debugging_port: int + host: str + def __init__( + self, + browser_type: str = "chromium", + user_data_dir: Optional[str] = None, + headless: bool = False, + logger=None, + host: str = "localhost", + debugging_port: int = 9222, + ): + """ + Initialize the ManagedBrowser instance. + + Args: + browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit". + Default: "chromium". + user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a + temporary directory may be used. Default: None. + headless (bool): Whether to run the browser in headless mode (no visible GUI). + Default: True. + logger (logging.Logger): Logger instance for logging messages. Default: None. + host (str): Host for debugging the browser. Default: "localhost". + debugging_port (int): Port for debugging the browser. Default: 9222. + """ + self.browser_type = browser_type + self.user_data_dir = user_data_dir + self.headless = headless + self.browser_process = None + self.temp_dir = None + self.debugging_port = debugging_port + self.host = host + self.logger = logger + self.shutting_down = False + + async def start(self) -> str: + """ + Starts the browser process and returns the CDP endpoint URL. + If user_data_dir is not provided, creates a temporary directory. 
+ """ + + # Create temp dir if needed + if not self.user_data_dir: + self.temp_dir = tempfile.mkdtemp(prefix="browser-profile-") + self.user_data_dir = self.temp_dir + + # Get browser path and args based on OS and browser type + browser_path = self._get_browser_path() + args = self._get_browser_args() + + # Start browser process + try: + self.browser_process = subprocess.Popen( + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + # Monitor browser process output for errors + asyncio.create_task(self._monitor_browser_process()) + await asyncio.sleep(2) # Give browser time to start + return f"http://{self.host}:{self.debugging_port}" + except Exception as e: + await self.cleanup() + raise Exception(f"Failed to start browser: {e}") + + async def _monitor_browser_process(self): + """ + Monitor the browser process for unexpected termination. + + How it works: + 1. Read stdout and stderr from the browser process. + 2. If the process has terminated, log the error message and terminate the browser. + 3. If the shutting_down flag is set, log the normal termination message. + 4. If any other error occurs, log the error message. + + Note: This method should be called in a separate task to avoid blocking the main event loop. + """ + if self.browser_process: + try: + stdout, stderr = await asyncio.gather( + asyncio.to_thread(self.browser_process.stdout.read), + asyncio.to_thread(self.browser_process.stderr.read), + ) + + # Check shutting_down flag BEFORE logging anything + if self.browser_process.poll() is not None: + if not self.shutting_down: + self.logger.error( + message="Browser process terminated unexpectedly | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}", + tag="ERROR", + params={ + "code": self.browser_process.returncode, + "stdout": stdout.decode(), + "stderr": stderr.decode(), + }, + ) + await self.cleanup() + else: + self.logger.info( + message="Browser process terminated normally | Code: {code}", + tag="INFO", + params={"code": self.browser_process.returncode}, + ) + except Exception as e: + if not self.shutting_down: + self.logger.error( + message="Error monitoring browser process: {error}", + tag="ERROR", + params={"error": str(e)}, + ) + + def _get_browser_path(self) -> str: + """Returns the browser executable path based on OS and browser type""" + if sys.platform == "darwin": # macOS + paths = { + "chromium": "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", + "firefox": "/Applications/Firefox.app/Contents/MacOS/firefox", + "webkit": "/Applications/Safari.app/Contents/MacOS/Safari", + } + elif sys.platform == "win32": # Windows + paths = { + "chromium": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", + "firefox": "C:\\Program Files\\Mozilla Firefox\\firefox.exe", + "webkit": None, # WebKit not supported on Windows + } + else: # Linux + paths = { + "chromium": "google-chrome", + "firefox": "firefox", + "webkit": None, # WebKit not supported on Linux + } + + return paths.get(self.browser_type) + + def _get_browser_args(self) -> List[str]: + """Returns browser-specific command line arguments""" + base_args = [self._get_browser_path()] + + if self.browser_type == "chromium": + args = [ + f"--remote-debugging-port={self.debugging_port}", + f"--user-data-dir={self.user_data_dir}", + ] + if self.headless: + args.append("--headless=new") + elif self.browser_type == "firefox": + args = [ + "--remote-debugging-port", + str(self.debugging_port), + "--profile", + self.user_data_dir, + ] + if self.headless: + args.append("--headless") + else: + raise 
NotImplementedError(f"Browser type {self.browser_type} not supported") + + return base_args + args + + async def cleanup(self): + """Cleanup browser process and temporary directory""" + # Set shutting_down flag BEFORE any termination actions + self.shutting_down = True + + if self.browser_process: + try: + self.browser_process.terminate() + # Wait for process to end gracefully + for _ in range(10): # 10 attempts, 100ms each + if self.browser_process.poll() is not None: + break + await asyncio.sleep(0.1) + + # Force kill if still running + if self.browser_process.poll() is None: + self.browser_process.kill() + await asyncio.sleep(0.1) # Brief wait for kill to take effect + + except Exception as e: + self.logger.error( + message="Error terminating browser: {error}", + tag="ERROR", + params={"error": str(e)}, + ) + + if self.temp_dir and os.path.exists(self.temp_dir): + try: + shutil.rmtree(self.temp_dir) + except Exception as e: + self.logger.error( + message="Error removing temporary directory: {error}", + tag="ERROR", + params={"error": str(e)}, + ) + + +class BrowserManager: + """ + Manages the browser instance and context. + + Attributes: + config (BrowserConfig): Configuration object containing all browser settings + logger: Logger instance for recording events and errors + browser (Browser): The browser instance + default_context (BrowserContext): The default browser context + managed_browser (ManagedBrowser): The managed browser instance + playwright (Playwright): The Playwright instance + sessions (dict): Dictionary to store session information + session_ttl (int): Session timeout in seconds + """ + def __init__(self, browser_config: BrowserConfig, logger=None): + """ + Initialize the BrowserManager with a browser configuration. + + Args: + browser_config (BrowserConfig): Configuration object containing all browser settings + logger: Logger instance for recording events and errors + """ + self.config: BrowserConfig = browser_config + self.logger = logger + + # Browser state + self.browser = None + self.default_context = None + self.managed_browser = None + self.playwright = None + + # Session management + self.sessions = {} + self.session_ttl = 1800 # 30 minutes + + # Initialize ManagedBrowser if needed + if self.config.use_managed_browser: + self.managed_browser = ManagedBrowser( + browser_type=self.config.browser_type, + user_data_dir=self.config.user_data_dir, + headless=self.config.headless, + logger=self.logger, + debugging_port=self.config.debugging_port, + ) + + async def start(self): + """ + Start the browser instance and set up the default context. + + How it works: + 1. Check if Playwright is already initialized. + 2. If not, initialize Playwright. + 3. If managed browser is used, start it and connect to the CDP endpoint. + 4. If managed browser is not used, launch the browser and set up the default context. + + Note: This method should be called in a separate task to avoid blocking the main event loop. 
+ """ + if self.playwright is None: + from playwright.async_api import async_playwright + + self.playwright = await async_playwright().start() + + if self.config.use_managed_browser: + cdp_url = await self.managed_browser.start() + self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) + contexts = self.browser.contexts + if contexts: + self.default_context = contexts[0] + else: + self.default_context = await self.create_browser_context() + # self.default_context = await self.browser.new_context( + # viewport={ + # "width": self.config.viewport_width, + # "height": self.config.viewport_height, + # }, + # storage_state=self.config.storage_state, + # user_agent=self.config.headers.get( + # "User-Agent", self.config.user_agent + # ), + # accept_downloads=self.config.accept_downloads, + # ignore_https_errors=self.config.ignore_https_errors, + # java_script_enabled=self.config.java_script_enabled, + # ) + await self.setup_context(self.default_context) + else: + browser_args = self._build_browser_args() + + # Launch appropriate browser type + if self.config.browser_type == "firefox": + self.browser = await self.playwright.firefox.launch(**browser_args) + elif self.config.browser_type == "webkit": + self.browser = await self.playwright.webkit.launch(**browser_args) + else: + self.browser = await self.playwright.chromium.launch(**browser_args) + + self.default_context = self.browser + + def _build_browser_args(self) -> dict: + """Build browser launch arguments from config.""" + args = [ + "--disable-gpu", + "--disable-gpu-compositing", + "--disable-software-rasterizer", + "--no-sandbox", + "--disable-dev-shm-usage", + "--no-first-run", + "--no-default-browser-check", + "--disable-infobars", + "--window-position=0,0", + "--ignore-certificate-errors", + "--ignore-certificate-errors-spki-list", + "--disable-blink-features=AutomationControlled", + "--window-position=400,0", + "--disable-renderer-backgrounding", + "--disable-ipc-flooding-protection", + "--force-color-profile=srgb", + "--mute-audio", + "--disable-background-timer-throttling", + # "--single-process", + f"--window-size={self.config.viewport_width},{self.config.viewport_height}", + ] + + if self.config.light_mode: + args.extend(BROWSER_DISABLE_OPTIONS) + + if self.config.text_mode: + args.extend( + [ + "--blink-settings=imagesEnabled=false", + "--disable-remote-fonts", + "--disable-images", + "--disable-javascript", + "--disable-software-rasterizer", + "--disable-dev-shm-usage", + ] + ) + + if self.config.extra_args: + args.extend(self.config.extra_args) + + browser_args = {"headless": self.config.headless, "args": args} + + if self.config.chrome_channel: + browser_args["channel"] = self.config.chrome_channel + + if self.config.accept_downloads: + browser_args["downloads_path"] = self.config.downloads_path or os.path.join( + os.getcwd(), "downloads" + ) + os.makedirs(browser_args["downloads_path"], exist_ok=True) + + if self.config.proxy or self.config.proxy_config: + from playwright.async_api import ProxySettings + + proxy_settings = ( + ProxySettings(server=self.config.proxy) + if self.config.proxy + else ProxySettings( + server=self.config.proxy_config.get("server"), + username=self.config.proxy_config.get("username"), + password=self.config.proxy_config.get("password"), + ) + ) + browser_args["proxy"] = proxy_settings + + return browser_args + + async def setup_context( + self, + context: BrowserContext, + crawlerRunConfig: CrawlerRunConfig, + is_default=False, + ): + """ + Set up a browser context with the configured 
options. + + How it works: + 1. Set extra HTTP headers if provided. + 2. Add cookies if provided. + 3. Load storage state if provided. + 4. Accept downloads if enabled. + 5. Set default timeouts for navigation and download. + 6. Set user agent if provided. + 7. Set browser hints if provided. + 8. Set proxy if provided. + 9. Set downloads path if provided. + 10. Set storage state if provided. + 11. Set cache if provided. + 12. Set extra HTTP headers if provided. + 13. Add cookies if provided. + 14. Set default timeouts for navigation and download if enabled. + 15. Set user agent if provided. + 16. Set browser hints if provided. + + Args: + context (BrowserContext): The browser context to set up + crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings + is_default (bool): Flag indicating if this is the default context + Returns: + None + """ + if self.config.headers: + await context.set_extra_http_headers(self.config.headers) + + if self.config.cookies: + await context.add_cookies(self.config.cookies) + + if self.config.storage_state: + await context.storage_state(path=None) + + if self.config.accept_downloads: + context.set_default_timeout(DOWNLOAD_PAGE_TIMEOUT) + context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT) + if self.config.downloads_path: + context._impl_obj._options["accept_downloads"] = True + context._impl_obj._options["downloads_path"] = ( + self.config.downloads_path + ) + + # Handle user agent and browser hints + if self.config.user_agent: + combined_headers = { + "User-Agent": self.config.user_agent, + "sec-ch-ua": self.config.browser_hint, + } + combined_headers.update(self.config.headers) + await context.set_extra_http_headers(combined_headers) + + # Add default cookie + await context.add_cookies( + [{"name": "cookiesEnabled", "value": "true", "url": crawlerRunConfig.url}] + ) + + # Handle navigator overrides + if ( + crawlerRunConfig.override_navigator + or crawlerRunConfig.simulate_user + or crawlerRunConfig.magic + ): + await context.add_init_script(load_js_script("navigator_overrider")) + + async def create_browser_context(self): + """ + Creates and returns a new browser context with configured settings. + Applies text-only mode settings if text_mode is enabled in config. 
+ + Returns: + Context: Browser context object with the specified configurations + """ + # Base settings + user_agent = self.config.headers.get("User-Agent", self.config.user_agent) + viewport_settings = { + "width": self.config.viewport_width, + "height": self.config.viewport_height, + } + proxy_settings = {"server": self.config.proxy} if self.config.proxy else None + + blocked_extensions = [ + # Images + 'jpg', 'jpeg', 'png', 'gif', 'webp', 'svg', 'ico', 'bmp', 'tiff', 'psd', + # Fonts + 'woff', 'woff2', 'ttf', 'otf', 'eot', + # Styles + # 'css', 'less', 'scss', 'sass', + # Media + 'mp4', 'webm', 'ogg', 'avi', 'mov', 'wmv', 'flv', 'm4v', + 'mp3', 'wav', 'aac', 'm4a', 'opus', 'flac', + # Documents + 'pdf', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx', + # Archives + 'zip', 'rar', '7z', 'tar', 'gz', + # Scripts and data + 'xml', 'swf', 'wasm' + ] + + # Common context settings + context_settings = { + "user_agent": user_agent, + "viewport": viewport_settings, + "proxy": proxy_settings, + "accept_downloads": self.config.accept_downloads, + "storage_state": self.config.storage_state, + "ignore_https_errors": self.config.ignore_https_errors, + "device_scale_factor": 1.0, + "java_script_enabled": self.config.java_script_enabled, + } + + if self.config.text_mode: + text_mode_settings = { + "has_touch": False, + "is_mobile": False, + } + # Update context settings with text mode settings + context_settings.update(text_mode_settings) + + # Create and return the context with all settings + context = await self.browser.new_context(**context_settings) + + # Apply text mode settings if enabled + if self.config.text_mode: + # Create and apply route patterns for each extension + for ext in blocked_extensions: + await context.route(f"**/*.{ext}", lambda route: route.abort()) + return context + + # async def get_page(self, session_id: Optional[str], user_agent: str): + async def get_page(self, crawlerRunConfig: CrawlerRunConfig): + """ + Get a page for the given session ID, creating a new one if needed. + + Args: + crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings + + Returns: + Page: The page object for the given session ID. + BrowserContext: The browser context for the given session ID. + """ + self._cleanup_expired_sessions() + + if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions: + context, page, _ = self.sessions[crawlerRunConfig.session_id] + self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) + return page, context + + if self.config.use_managed_browser: + context = self.default_context + page = await context.new_page() + else: + context = await self.create_browser_context() + await self.setup_context(context, crawlerRunConfig) + page = await context.new_page() + + if crawlerRunConfig.session_id: + self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) + + return page, context + + async def kill_session(self, session_id: str): + """ + Kill a browser session and clean up resources. + + Args: + session_id (str): The session ID to kill. 
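+
+        Example:
+            Illustrative call (assumes `browser_manager` is a BrowserManager and the session
+            was previously created by passing the same `session_id` through CrawlerRunConfig):
+
+                await browser_manager.kill_session("my-session")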
+ """ + if session_id in self.sessions: + context, page, _ = self.sessions[session_id] + await page.close() + if not self.config.use_managed_browser: + await context.close() + del self.sessions[session_id] + + def _cleanup_expired_sessions(self): + """Clean up expired sessions based on TTL.""" + current_time = time.time() + expired_sessions = [ + sid + for sid, (_, _, last_used) in self.sessions.items() + if current_time - last_used > self.session_ttl + ] + for sid in expired_sessions: + asyncio.create_task(self.kill_session(sid)) + + async def close(self): + """Close all browser resources and clean up.""" + if self.config.sleep_on_close: + await asyncio.sleep(0.5) + + session_ids = list(self.sessions.keys()) + for session_id in session_ids: + await self.kill_session(session_id) + + if self.browser: + await self.browser.close() + self.browser = None + + if self.managed_browser: + await asyncio.sleep(0.5) + await self.managed_browser.cleanup() + self.managed_browser = None + + if self.playwright: + await self.playwright.stop() + self.playwright = None + + +class AsyncCrawlerStrategy(ABC): + """ + Abstract base class for crawler strategies. + Subclasses must implement the crawl method. + """ + @abstractmethod + async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: + pass # 4 + 3 + + + +class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): + """ + Crawler strategy using Playwright. + + Attributes: + browser_config (BrowserConfig): Configuration object containing browser settings. + logger (AsyncLogger): Logger instance for recording events and errors. + _downloaded_files (List[str]): List of downloaded file paths. + hooks (Dict[str, Callable]): Dictionary of hooks for custom behavior. + browser_manager (BrowserManager): Manager for browser creation and management. + + Methods: + __init__(self, browser_config=None, logger=None, **kwargs): + Initialize the AsyncPlaywrightCrawlerStrategy with a browser configuration. + __aenter__(self): + Start the browser and initialize the browser manager. + __aexit__(self, exc_type, exc_val, exc_tb): + Close the browser and clean up resources. + start(self): + Start the browser and initialize the browser manager. + close(self): + Close the browser and clean up resources. + kill_session(self, session_id): + Kill a browser session and clean up resources. + crawl(self, url, **kwargs): + Run the crawler for a single URL. + + """ + def __init__( + self, browser_config: BrowserConfig = None, logger: AsyncLogger = None, **kwargs + ): + """ + Initialize the AsyncPlaywrightCrawlerStrategy with a browser configuration. + + Args: + browser_config (BrowserConfig): Configuration object containing browser settings. + If None, will be created from kwargs for backwards compatibility. + logger: Logger instance for recording events and errors. + **kwargs: Additional arguments for backwards compatibility and extending functionality. 
+ """ + # Initialize browser config, either from provided object or kwargs + self.browser_config = browser_config or BrowserConfig.from_kwargs(kwargs) + self.logger = logger + + # Initialize session management + self._downloaded_files = [] + + # Initialize hooks system + self.hooks = { + "on_browser_created": None, + "on_page_context_created": None, + "on_user_agent_updated": None, + "on_execution_started": None, + "before_goto": None, + "after_goto": None, + "before_return_html": None, + "before_retrieve_html": None, + } + + # Initialize browser manager with config + self.browser_manager = BrowserManager( + browser_config=self.browser_config, logger=self.logger + ) + + async def __aenter__(self): + await self.start() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + await self.close() + + async def start(self): + """ + Start the browser and initialize the browser manager. + """ + await self.browser_manager.start() + await self.execute_hook( + "on_browser_created", + self.browser_manager.browser, + context=self.browser_manager.default_context, + ) + + async def close(self): + """ + Close the browser and clean up resources. + """ + await self.browser_manager.close() + + async def kill_session(self, session_id: str): + """ + Kill a browser session and clean up resources. + + Args: + session_id (str): The ID of the session to kill. + + Returns: + None + """ + # Log a warning message and no need kill session, in new version auto kill session + self.logger.warning( + message="Session auto-kill is enabled in the new version. No need to manually kill sessions.", + tag="WARNING", + ) + await self.browser_manager.kill_session(session_id) + + def set_hook(self, hook_type: str, hook: Callable): + """ + Set a hook function for a specific hook type. Following are list of hook types: + - on_browser_created: Called when a new browser instance is created. + - on_page_context_created: Called when a new page context is created. + - on_user_agent_updated: Called when the user agent is updated. + - on_execution_started: Called when the execution starts. + - before_goto: Called before a goto operation. + - after_goto: Called after a goto operation. + - before_return_html: Called before returning HTML content. + - before_retrieve_html: Called before retrieving HTML content. + + All hooks except on_browser_created accepts a context and a page as arguments and **kwargs. However, on_browser_created accepts a browser and a context as arguments and **kwargs. + + Args: + hook_type (str): The type of the hook. + hook (Callable): The hook function to set. + + Returns: + None + """ + if hook_type in self.hooks: + self.hooks[hook_type] = hook + else: + raise ValueError(f"Invalid hook type: {hook_type}") + + async def execute_hook(self, hook_type: str, *args, **kwargs): + """ + Execute a hook function for a specific hook type. + + Args: + hook_type (str): The type of the hook. + *args: Variable length positional arguments. + **kwargs: Keyword arguments. + + Returns: + The return value of the hook function, if any. + """ + hook = self.hooks.get(hook_type) + if hook: + if asyncio.iscoroutinefunction(hook): + return await hook(*args, **kwargs) + else: + return hook(*args, **kwargs) + return args[0] if args else None + + def update_user_agent(self, user_agent: str): + """ + Update the user agent for the browser. + + Args: + user_agent (str): The new user agent string. 
+ + Returns: + None + """ + self.user_agent = user_agent + + def set_custom_headers(self, headers: Dict[str, str]): + """ + Set custom headers for the browser. + + Args: + headers (Dict[str, str]): A dictionary of headers to set. + + Returns: + None + """ + self.headers = headers + + async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000): + """ + Wait for a condition in a smart way. This functions works as below: + + 1. If wait_for starts with 'js:', it assumes it's a JavaScript function and waits for it to return true. + 2. If wait_for starts with 'css:', it assumes it's a CSS selector and waits for it to be present. + 3. Otherwise, it tries to evaluate wait_for as a JavaScript function and waits for it to return true. + 4. If it's not a JavaScript function, it assumes it's a CSS selector and waits for it to be present. + + This is a more advanced version of the wait_for parameter in CrawlerStrategy.crawl(). + Args: + page: Playwright page object + wait_for (str): The condition to wait for. Can be a CSS selector, a JavaScript function, or explicitly prefixed with 'js:' or 'css:'. + timeout (float): Maximum time to wait in milliseconds + + Returns: + None + """ + wait_for = wait_for.strip() + + if wait_for.startswith("js:"): + # Explicitly specified JavaScript + js_code = wait_for[3:].strip() + return await self.csp_compliant_wait(page, js_code, timeout) + elif wait_for.startswith("css:"): + # Explicitly specified CSS selector + css_selector = wait_for[4:].strip() + try: + await page.wait_for_selector(css_selector, timeout=timeout) + except Error as e: + if "Timeout" in str(e): + raise TimeoutError( + f"Timeout after {timeout}ms waiting for selector '{css_selector}'" + ) + else: + raise ValueError(f"Invalid CSS selector: '{css_selector}'") + else: + # Auto-detect based on content + if wait_for.startswith("()") or wait_for.startswith("function"): + # It's likely a JavaScript function + return await self.csp_compliant_wait(page, wait_for, timeout) + else: + # Assume it's a CSS selector first + try: + await page.wait_for_selector(wait_for, timeout=timeout) + except Error as e: + if "Timeout" in str(e): + raise TimeoutError( + f"Timeout after {timeout}ms waiting for selector '{wait_for}'" + ) + else: + # If it's not a timeout error, it might be an invalid selector + # Let's try to evaluate it as a JavaScript function as a fallback + try: + return await self.csp_compliant_wait( + page, f"() => {{{wait_for}}}", timeout + ) + except Error: + raise ValueError( + f"Invalid wait_for parameter: '{wait_for}'. " + "It should be either a valid CSS selector, a JavaScript function, " + "or explicitly prefixed with 'js:' or 'css:'." + ) + + async def csp_compliant_wait( self, page: Page, user_wait_function: str, timeout: float = 30000 ): + """ + Wait for a condition in a CSP-compliant way. 
+ + Args: + page: Playwright page object + user_wait_function: JavaScript function as string that returns boolean + timeout: Maximum time to wait in milliseconds + + Returns: + bool: True if condition was met, False if timed out + + Raises: + RuntimeError: If there's an error evaluating the condition + """ + wrapper_js = f""" + async () => {{ + const userFunction = {user_wait_function}; + const startTime = Date.now(); + try {{ + while (true) {{ + if (await userFunction()) {{ + return true; + }} + if (Date.now() - startTime > {timeout}) {{ + return false; // Return false instead of throwing + }} + await new Promise(resolve => setTimeout(resolve, 100)); + }} + }} catch (error) {{ + throw new Error(`Error evaluating condition: ${{error.message}}`); + }} + }} + """ + + try: + result = await page.evaluate(wrapper_js) + return result + except Exception as e: + if "Error evaluating condition" in str(e): + raise RuntimeError(f"Failed to evaluate wait condition: {str(e)}") + # For timeout or other cases, just return False + return False + + async def process_iframes(self, page): + """ + Process iframes on a page. This function will extract the content of each iframe and replace it with a div containing the extracted content. + + Args: + page: Playwright page object + + Returns: + Playwright page object + """ + # Find all iframes + iframes = await page.query_selector_all("iframe") + + for i, iframe in enumerate(iframes): + try: + # Add a unique identifier to the iframe + await iframe.evaluate(f'(element) => element.id = "iframe-{i}"') + + # Get the frame associated with this iframe + frame = await iframe.content_frame() + + if frame: + # Wait for the frame to load + await frame.wait_for_load_state( + "load", timeout=30000 + ) # 30 seconds timeout + + # Extract the content of the iframe's body + iframe_content = await frame.evaluate( + "() => document.body.innerHTML" + ) + + # Generate a unique class name for this iframe + class_name = f"extracted-iframe-content-{i}" + + # Replace the iframe with a div containing the extracted content + _iframe = iframe_content.replace("`", "\\`") + await page.evaluate( + f""" + () => {{ + const iframe = document.getElementById('iframe-{i}'); + const div = document.createElement('div'); + div.innerHTML = `{_iframe}`; + div.className = '{class_name}'; + iframe.replaceWith(div); + }} + """ + ) + else: + self.logger.warning( + message="Could not access content frame for iframe {index}", + tag="SCRAPE", + params={"index": i}, + ) + except Exception as e: + self.logger.error( + message="Error processing iframe {index}: {error}", + tag="ERROR", + params={"index": i, "error": str(e)}, + ) + + # Return the page object + return page + + async def create_session(self, **kwargs) -> str: + """ + Creates a new browser session and returns its ID. A browse session is a unique openned page can be reused for multiple crawls. + This function is asynchronous and returns a string representing the session ID. + + Args: + **kwargs: Optional keyword arguments to configure the session. + + Returns: + str: The session ID. 
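A hedged sketch of calling `csp_compliant_wait` directly. The predicate string is an illustrative example; the timeout is in milliseconds, and the return value is the boolean produced by the polling wrapper above.

```python
from playwright.async_api import Page


async def wait_for_results(strategy, page: Page) -> bool:
    # The user function is a JS expression returning a boolean; it is polled
    # every 100 ms until it is true or the timeout elapses (returning False).
    return await strategy.csp_compliant_wait(
        page,
        "() => document.querySelectorAll('table tr').length > 1",
        timeout=10_000,
    )
```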
+ """ + await self.start() + + session_id = kwargs.get("session_id") or str(uuid.uuid4()) + + user_agent = kwargs.get("user_agent", self.user_agent) + # Use browser_manager to get a fresh page & context assigned to this session_id + page, context = await self.browser_manager.get_page(session_id, user_agent) + return session_id + + async def crawl( self, url: str, config: CrawlerRunConfig, **kwargs ) -> AsyncCrawlResponse: + """ + Crawls a given URL or processes raw HTML/local file content based on the URL prefix. + + Args: + url (str): The URL to crawl. Supported prefixes: + - 'http://' or 'https://': Web URL to crawl. + - 'file://': Local file path to process. + - 'raw://': Raw HTML content to process. + **kwargs: Additional parameters: + - 'screenshot' (bool): Whether to take a screenshot. + - ... [other existing parameters] + + Returns: + AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot. + """ + config = config or CrawlerRunConfig.from_kwargs(kwargs) + response_headers = {} + status_code = 200 # Default for local/raw HTML + screenshot_data = None + + if url.startswith(("http://", "https://")): + return await self._crawl_web(url, config) + + elif url.startswith("file://"): + # Process local file + local_file_path = url[7:] # Remove 'file://' prefix + if not os.path.exists(local_file_path): + raise FileNotFoundError(f"Local file not found: {local_file_path}") + with open(local_file_path, "r", encoding="utf-8") as f: + html = f.read() + if config.screenshot: + screenshot_data = await self._generate_screenshot_from_html(html) + return AsyncCrawlResponse( + html=html, + response_headers=response_headers, + status_code=status_code, + screenshot=screenshot_data, + get_delayed_content=None, + ) + + elif url.startswith("raw:") or url.startswith("raw://"): + # Process raw HTML content + raw_html = url[4:] if url[:4] == "raw:" else url[7:] + html = raw_html + if config.screenshot: + screenshot_data = await self._generate_screenshot_from_html(html) + return AsyncCrawlResponse( + html=html, + response_headers=response_headers, + status_code=status_code, + screenshot=screenshot_data, + get_delayed_content=None, + ) + else: + raise ValueError( + "URL must start with 'http://', 'https://', 'file://', or 'raw:'" + ) + + async def _crawl_web( self, url: str, config: CrawlerRunConfig ) -> AsyncCrawlResponse: + """ + Internal method to crawl web URLs with the specified configuration. 
+ + Args: + url (str): The web URL to crawl + config (CrawlerRunConfig): Configuration object controlling the crawl behavior + + Returns: + AsyncCrawlResponse: The response containing HTML, headers, status code, and optional data + """ + config.url = url + response_headers = {} + status_code = None + + # Reset downloaded files list for new crawl + self._downloaded_files = [] + + # Handle user agent with magic mode + user_agent = self.browser_config.user_agent + if config.magic and self.browser_config.user_agent_mode != "random": + self.browser_config.user_agent = UserAgentGenerator().generate( + **(self.browser_config.user_agent_generator_config or {}) + ) + + # Get page for session + page, context = await self.browser_manager.get_page(crawlerRunConfig=config) + + # Add default cookie + await context.add_cookies( + [{"name": "cookiesEnabled", "value": "true", "url": url}] + ) + + # Handle navigator overrides + if config.override_navigator or config.simulate_user or config.magic: + await context.add_init_script(load_js_script("navigator_overrider")) + + # Call hook after page creation + await self.execute_hook("on_page_context_created", page, context=context) + + # Set up console logging if requested + if config.log_console: + + def log_consol( + msg, console_log_type="debug" + ): # Corrected the parameter syntax + if console_log_type == "error": + self.logger.error( + message=f"Console error: {msg}", # Use f-string for variable interpolation + tag="CONSOLE", + params={"msg": msg.text}, + ) + elif console_log_type == "debug": + self.logger.debug( + message=f"Console: {msg}", # Use f-string for variable interpolation + tag="CONSOLE", + params={"msg": msg.text}, + ) + + page.on("console", log_consol) + page.on("pageerror", lambda e: log_consol(e, "error")) + + try: + # Get SSL certificate information if requested and URL is HTTPS + ssl_cert = None + if config.fetch_ssl_certificate: + ssl_cert = SSLCertificate.from_url(url) + + # Set up download handling + if self.browser_config.accept_downloads: + page.on( + "download", + lambda download: asyncio.create_task( + self._handle_download(download) + ), + ) + + # Handle page navigation and content loading + if not config.js_only: + await self.execute_hook("before_goto", page, context=context, url=url) + + try: + # Generate a unique nonce for this request + nonce = hashlib.sha256(os.urandom(32)).hexdigest() + + # Add CSP headers to the request + await page.set_extra_http_headers({ + 'Content-Security-Policy': f"default-src 'self'; script-src 'self' 'nonce-{nonce}' 'strict-dynamic'" + }) + + response = await page.goto( + url, wait_until=config.wait_until, timeout=config.page_timeout + ) + except Error as e: + raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}") + + await self.execute_hook("after_goto", page, context=context, url=url, response=response) + + if response is None: + status_code = 200 + response_headers = {} + else: + status_code = response.status + response_headers = response.headers + + else: + status_code = 200 + response_headers = {} + + # Wait for body element and visibility + try: + await page.wait_for_selector("body", state="attached", timeout=30000) + + # Use the new check_visibility function with csp_compliant_wait + is_visible = await self.csp_compliant_wait( + page, + """() => { + const element = document.body; + if (!element) return false; + const style = window.getComputedStyle(element); + const isVisible = style.display !== 'none' && + style.visibility !== 'hidden' && + style.opacity !== '0'; + return isVisible; + 
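The behaviours wired up in `_crawl_web` above are driven by flags on the run config. The field names below are the ones this method reads; treating them as constructor keyword arguments is an assumption.

```python
from crawl4ai.async_configs import CrawlerRunConfig

config = CrawlerRunConfig(
    magic=True,                  # random user agent + navigator overrides + user simulation
    simulate_user=True,          # small mouse/keyboard interaction pass
    log_console=True,            # forwards console messages and pageerror events to the logger
    fetch_ssl_certificate=True,  # captures SSLCertificate.from_url(url) for HTTPS targets
    wait_until="domcontentloaded",
    page_timeout=60_000,         # milliseconds, used for goto and wait_for
)
```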
}""", + timeout=30000 + ) + + if not is_visible and not config.ignore_body_visibility: + visibility_info = await self.check_visibility(page) + raise Error(f"Body element is hidden: {visibility_info}") + + except Error as e: + visibility_info = await self.check_visibility(page) + + if self.config.verbose: + self.logger.debug( + message="Body visibility info: {info}", + tag="DEBUG", + params={"info": visibility_info}, + ) + + if not config.ignore_body_visibility: + raise Error(f"Body element is hidden: {visibility_info}") + + + # try: + # await page.wait_for_selector("body", state="attached", timeout=30000) + + # await page.wait_for_function( + # """ + # () => { + # const body = document.body; + # const style = window.getComputedStyle(body); + # return style.display !== 'none' && + # style.visibility !== 'hidden' && + # style.opacity !== '0'; + # } + # """, + # timeout=30000, + # ) + # except Error as e: + # visibility_info = await page.evaluate( + # """ + # () => { + # const body = document.body; + # const style = window.getComputedStyle(body); + # return { + # display: style.display, + # visibility: style.visibility, + # opacity: style.opacity, + # hasContent: body.innerHTML.length, + # classList: Array.from(body.classList) + # } + # } + # """ + # ) + + # if self.config.verbose: + # self.logger.debug( + # message="Body visibility info: {info}", + # tag="DEBUG", + # params={"info": visibility_info}, + # ) + + # if not config.ignore_body_visibility: + # raise Error(f"Body element is hidden: {visibility_info}") + + # Handle content loading and viewport adjustment + if not self.browser_config.text_mode and ( + config.wait_for_images or config.adjust_viewport_to_content + ): + await page.wait_for_load_state("domcontentloaded") + await asyncio.sleep(0.1) + + # Check for image loading with improved error handling + images_loaded = await self.csp_compliant_wait( + page, + "() => Array.from(document.getElementsByTagName('img')).every(img => img.complete)", + timeout=1000 + ) + + if not images_loaded and self.logger: + self.logger.warning( + message="Some images failed to load within timeout", + tag="SCRAPE", + ) + + # Adjust viewport if needed + if not self.browser_config.text_mode and config.adjust_viewport_to_content: + try: + dimensions = await self.get_page_dimensions(page) + page_height = dimensions['height'] + page_width = dimensions['width'] + # page_width = await page.evaluate( + # "document.documentElement.scrollWidth" + # ) + # page_height = await page.evaluate( + # "document.documentElement.scrollHeight" + # ) + + target_width = self.browser_config.viewport_width + target_height = int(target_width * page_width / page_height * 0.95) + await page.set_viewport_size( + {"width": target_width, "height": target_height} + ) + + scale = min(target_width / page_width, target_height / page_height) + cdp = await page.context.new_cdp_session(page) + await cdp.send( + "Emulation.setDeviceMetricsOverride", + { + "width": page_width, + "height": page_height, + "deviceScaleFactor": 1, + "mobile": False, + "scale": scale, + }, + ) + except Exception as e: + self.logger.warning( + message="Failed to adjust viewport to content: {error}", + tag="VIEWPORT", + params={"error": str(e)}, + ) + + # Handle full page scanning + if config.scan_full_page: + await self._handle_full_page_scan(page, config.scroll_delay) + + # Execute JavaScript if provided + # if config.js_code: + # if isinstance(config.js_code, str): + # await page.evaluate(config.js_code) + # elif isinstance(config.js_code, list): + # for js in 
config.js_code: + # await page.evaluate(js) + + if config.js_code: + # execution_result = await self.execute_user_script(page, config.js_code) + execution_result = await self.robust_execute_user_script(page, config.js_code) + if not execution_result["success"]: + self.logger.warning( + message="User script execution had issues: {error}", + tag="JS_EXEC", + params={"error": execution_result.get("error")} + ) + + await self.execute_hook("on_execution_started", page, context=context) + + # Handle user simulation + if config.simulate_user or config.magic: + await page.mouse.move(100, 100) + await page.mouse.down() + await page.mouse.up() + await page.keyboard.press("ArrowDown") + + # Handle wait_for condition + if config.wait_for: + try: + await self.smart_wait( + page, config.wait_for, timeout=config.page_timeout + ) + except Exception as e: + raise RuntimeError(f"Wait condition failed: {str(e)}") + + # Update image dimensions if needed + if not self.browser_config.text_mode: + update_image_dimensions_js = load_js_script("update_image_dimensions") + try: + try: + await page.wait_for_load_state("domcontentloaded", timeout=5) + except PlaywrightTimeoutError: + pass + await page.evaluate(update_image_dimensions_js) + except Exception as e: + self.logger.error( + message="Error updating image dimensions: {error}", + tag="ERROR", + params={"error": str(e)}, + ) + + # Process iframes if needed + if config.process_iframes: + page = await self.process_iframes(page) + + # Pre-content retrieval hooks and delay + await self.execute_hook("before_retrieve_html", page, context=context) + if config.delay_before_return_html: + await asyncio.sleep(config.delay_before_return_html) + + # Handle overlay removal + if config.remove_overlay_elements: + await self.remove_overlay_elements(page) + + # Get final HTML content + html = await page.content() + await self.execute_hook("before_return_html", page = page, html = html, context=context) + + # Handle PDF and screenshot generation + start_export_time = time.perf_counter() + pdf_data = None + screenshot_data = None + + if config.pdf: + pdf_data = await self.export_pdf(page) + + if config.screenshot: + if config.screenshot_wait_for: + await asyncio.sleep(config.screenshot_wait_for) + screenshot_data = await self.take_screenshot( + page, screenshot_height_threshold=config.screenshot_height_threshold + ) + + if screenshot_data or pdf_data: + self.logger.info( + message="Exporting PDF and taking screenshot took {duration:.2f}s", + tag="EXPORT", + params={"duration": time.perf_counter() - start_export_time}, + ) + + # Define delayed content getter + async def get_delayed_content(delay: float = 5.0) -> str: + self.logger.info( + message="Waiting for {delay} seconds before retrieving content for {url}", + tag="INFO", + params={"delay": delay, "url": url}, + ) + await asyncio.sleep(delay) + return await page.content() + + # Return complete response + return AsyncCrawlResponse( + html=html, + response_headers=response_headers, + status_code=status_code, + screenshot=screenshot_data, + pdf_data=pdf_data, + get_delayed_content=get_delayed_content, + ssl_certificate=ssl_cert, + downloaded_files=( + self._downloaded_files if self._downloaded_files else None + ), + ) + + except Exception as e: + raise e + + finally: + # If no session_id is given we should close the page + if not config.session_id: + await page.close() + + async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1): + """ + Helper method to handle full page scanning. + + How it works: + 1. 
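Putting the JS execution, wait condition, and delayed-content pieces together from the caller's side. A hedged sketch: the `CrawlerRunConfig` constructor arguments are assumed to mirror the field names read above, and `session_id` is set so the page stays open for `get_delayed_content` (pages without a session are closed when `crawl()` returns).

```python
from crawl4ai.async_configs import CrawlerRunConfig


async def crawl_with_js(strategy):
    config = CrawlerRunConfig(
        session_id="demo-session",          # keep the page open after crawl() returns
        js_code=[
            "document.querySelector('#load-more')?.click();",
            "window.scrollTo(0, document.body.scrollHeight);",
        ],
        wait_for="css:.results-loaded",     # dispatched through smart_wait
        delay_before_return_html=0.5,
        remove_overlay_elements=True,
        screenshot=True,
        pdf=True,
    )
    response = await strategy.crawl("https://example.com", config=config)

    # Re-read the page a few seconds later, e.g. after lazy content loads.
    later_html = await response.get_delayed_content(3.0)
    return response, later_html
```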
Get the viewport height. + 2. Scroll to the bottom of the page. + 3. Get the total height of the page. + 4. Scroll back to the top of the page. + 5. Scroll to the bottom of the page again. + 6. Continue scrolling until the bottom of the page is reached. + + Args: + page (Page): The Playwright page object + scroll_delay (float): The delay between page scrolls + + """ + try: + viewport_height = page.viewport_size.get( + "height", self.browser_config.viewport_height + ) + current_position = viewport_height + + # await page.evaluate(f"window.scrollTo(0, {current_position})") + await self.safe_scroll(page, 0, current_position, delay=scroll_delay) + # await self.csp_scroll_to(page, 0, current_position) + # await asyncio.sleep(scroll_delay) + + # total_height = await page.evaluate("document.documentElement.scrollHeight") + dimensions = await self.get_page_dimensions(page) + total_height = dimensions['height'] + + while current_position < total_height: + current_position = min(current_position + viewport_height, total_height) + await self.safe_scroll(page, 0, current_position, delay=scroll_delay) + # await page.evaluate(f"window.scrollTo(0, {current_position})") + # await asyncio.sleep(scroll_delay) + + # new_height = await page.evaluate("document.documentElement.scrollHeight") + dimensions = await self.get_page_dimensions(page) + new_height = dimensions['height'] + + if new_height > total_height: + total_height = new_height + + # await page.evaluate("window.scrollTo(0, 0)") + await self.safe_scroll(page, 0, 0) + + except Exception as e: + self.logger.warning( + message="Failed to perform full page scan: {error}", + tag="PAGE_SCAN", + params={"error": str(e)}, + ) + else: + # await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + await self.safe_scroll(page, 0, total_height) + + async def _handle_download(self, download): + """ + Handle file downloads. + + How it works: + 1. Get the suggested filename. + 2. Get the download path. + 3. Log the download. + 4. Start the download. + 5. Save the downloaded file. + 6. Log the completion. + + Args: + download (Download): The Playwright download object + + Returns: + None + """ + try: + suggested_filename = download.suggested_filename + download_path = os.path.join(self.downloads_path, suggested_filename) + + self.logger.info( + message="Downloading {filename} to {path}", + tag="FETCH", + params={"filename": suggested_filename, "path": download_path}, + ) + + start_time = time.perf_counter() + await download.save_as(download_path) + end_time = time.perf_counter() + self._downloaded_files.append(download_path) + + self.logger.success( + message="Downloaded {filename} successfully", + tag="COMPLETE", + params={ + "filename": suggested_filename, + "path": download_path, + "duration": f"{end_time - start_time:.2f}s", + }, + ) + except Exception as e: + self.logger.error( + message="Failed to handle download: {error}", + tag="ERROR", + params={"error": str(e)}, + ) + + async def remove_overlay_elements(self, page: Page) -> None: + """ + Removes popup overlays, modals, cookie notices, and other intrusive elements from the page. 
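Downloads are only captured when the browser config opts in; full-page scanning is a run-config flag. A hedged sketch of the caller side, assuming the constructor keyword arguments match the attribute names checked above and that a downloads directory (`downloads_path`) is configured alongside `accept_downloads`.

```python
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy


async def crawl_with_downloads():
    browser_config = BrowserConfig(accept_downloads=True)   # enables the page.on("download") handler
    run_config = CrawlerRunConfig(scan_full_page=True, scroll_delay=0.2)

    async with AsyncPlaywrightCrawlerStrategy(browser_config=browser_config) as strategy:
        response = await strategy.crawl("https://example.com/files", config=run_config)
        # List of saved file paths, or None when nothing was downloaded.
        return response.downloaded_files
```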
+ + Args: + page (Page): The Playwright page instance + """ + remove_overlays_js = load_js_script("remove_overlay_elements") + + try: + await page.evaluate(f""" + (() => {{ + try {{ + {remove_overlays_js} + return {{ success: true }}; + }} catch (error) {{ + return {{ + success: false, + error: error.toString(), + stack: error.stack + }}; + }} + }})() + """) + await page.wait_for_timeout(500) # Wait for any animations to complete + except Exception as e: + self.logger.warning( + message="Failed to remove overlay elements: {error}", + tag="SCRAPE", + params={"error": str(e)}, + ) + + async def export_pdf(self, page: Page) -> bytes: + """ + Exports the current page as a PDF. + + Args: + page (Page): The Playwright page object + + Returns: + bytes: The PDF data + """ + pdf_data = await page.pdf(print_background=True) + return pdf_data + + async def take_screenshot(self, page, **kwargs) -> str: + """ + Take a screenshot of the current page. + + Args: + page (Page): The Playwright page object + kwargs: Additional keyword arguments + + Returns: + str: The base64-encoded screenshot data + """ + need_scroll = await self.page_need_scroll(page) + + if not need_scroll: + # Page is short enough, just take a screenshot + return await self.take_screenshot_naive(page) + else: + # Page is too long, try to take a full-page screenshot + return await self.take_screenshot_scroller(page, **kwargs) + # return await self.take_screenshot_from_pdf(await self.export_pdf(page)) + + async def take_screenshot_from_pdf(self, pdf_data: bytes) -> str: + """ + Convert the first page of the PDF to a screenshot. + + Requires pdf2image and poppler. + + Args: + pdf_data (bytes): The PDF data + + Returns: + str: The base64-encoded screenshot data + """ + try: + from pdf2image import convert_from_bytes + + images = convert_from_bytes(pdf_data) + final_img = images[0].convert("RGB") + buffered = BytesIO() + final_img.save(buffered, format="JPEG") + return base64.b64encode(buffered.getvalue()).decode("utf-8") + except Exception as e: + error_message = f"Failed to take PDF-based screenshot: {str(e)}" + self.logger.error( + message="PDF Screenshot failed: {error}", + tag="ERROR", + params={"error": error_message}, + ) + # Return error image as fallback + img = Image.new("RGB", (800, 600), color="black") + draw = ImageDraw.Draw(img) + font = ImageFont.load_default() + draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) + buffered = BytesIO() + img.save(buffered, format="JPEG") + return base64.b64encode(buffered.getvalue()).decode("utf-8") + + async def take_screenshot_scroller(self, page: Page, **kwargs) -> str: + """ + Attempt to set a large viewport and take a full-page screenshot. + If still too large, segment the page as before. + + Requires pdf2image and poppler. 
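A hedged sketch of consuming the two export helpers above: `export_pdf` returns raw bytes, while the screenshot helpers return base64-encoded image data and close the page when they finish, so the screenshot is taken last here.

```python
import base64

from playwright.async_api import Page


async def save_exports(strategy, page: Page) -> None:
    pdf_bytes = await strategy.export_pdf(page)
    with open("page.pdf", "wb") as f:
        f.write(pdf_bytes)

    # Decoded image bytes; the exact encoding depends on which screenshot path ran
    # (single viewport shot for short pages, stitched segments for long ones).
    screenshot_b64 = await strategy.take_screenshot(page)
    with open("screenshot.png", "wb") as f:
        f.write(base64.b64decode(screenshot_b64))
```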
+ + Args: + page (Page): The Playwright page object + kwargs: Additional keyword arguments + + Returns: + str: The base64-encoded screenshot data + """ + try: + # Get page height + dimensions = await self.get_page_dimensions(page) + page_width = dimensions['width'] + page_height = dimensions['height'] + # page_height = await page.evaluate("document.documentElement.scrollHeight") + # page_width = await page.evaluate("document.documentElement.scrollWidth") + + # Set a large viewport + large_viewport_height = min( + page_height, + kwargs.get("screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD), + ) + await page.set_viewport_size( + {"width": page_width, "height": large_viewport_height} + ) + + # Page still too long, segment approach + segments = [] + viewport_size = page.viewport_size + viewport_height = viewport_size["height"] + + num_segments = (page_height // viewport_height) + 1 + for i in range(num_segments): + y_offset = i * viewport_height + await page.evaluate(f"window.scrollTo(0, {y_offset})") + await asyncio.sleep(0.01) # wait for render + seg_shot = await page.screenshot(full_page=False) + img = Image.open(BytesIO(seg_shot)).convert("RGB") + segments.append(img) + + total_height = sum(img.height for img in segments) + stitched = Image.new("RGB", (segments[0].width, total_height)) + offset = 0 + for img in segments: + # stitched.paste(img, (0, offset)) + stitched.paste(img.convert("RGB"), (0, offset)) + offset += img.height + + buffered = BytesIO() + stitched = stitched.convert("RGB") + stitched.save(buffered, format="BMP", quality=85) + encoded = base64.b64encode(buffered.getvalue()).decode("utf-8") + + return encoded + except Exception as e: + error_message = f"Failed to take large viewport screenshot: {str(e)}" + self.logger.error( + message="Large viewport screenshot failed: {error}", + tag="ERROR", + params={"error": error_message}, + ) + # return error image + img = Image.new("RGB", (800, 600), color="black") + draw = ImageDraw.Draw(img) + font = ImageFont.load_default() + draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) + buffered = BytesIO() + img.save(buffered, format="JPEG") + return base64.b64encode(buffered.getvalue()).decode("utf-8") + finally: + await page.close() + + async def take_screenshot_naive(self, page: Page) -> str: + """ + Takes a screenshot of the current page. + + Args: + page (Page): The Playwright page instance + + Returns: + str: Base64-encoded screenshot image + """ + try: + # The page is already loaded, just take the screenshot + screenshot = await page.screenshot(full_page=False) + return base64.b64encode(screenshot).decode("utf-8") + except Exception as e: + error_message = f"Failed to take screenshot: {str(e)}" + self.logger.error( + message="Screenshot failed: {error}", + tag="ERROR", + params={"error": error_message}, + ) + + # Generate an error image + img = Image.new("RGB", (800, 600), color="black") + draw = ImageDraw.Draw(img) + font = ImageFont.load_default() + draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) + + buffered = BytesIO() + img.save(buffered, format="JPEG") + return base64.b64encode(buffered.getvalue()).decode("utf-8") + finally: + await page.close() + + async def export_storage_state(self, path: str = None) -> dict: + """ + Exports the current storage state (cookies, localStorage, sessionStorage) + to a JSON file at the specified path. 
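The stitching step used by the scrolling screenshot above can be isolated into a small PIL helper. This is a simplified sketch of the same idea (viewport-sized segments pasted top-to-bottom onto one canvas), not the exact implementation.

```python
from io import BytesIO
from typing import List

from PIL import Image


def stitch_segments(segment_bytes: List[bytes]) -> bytes:
    """Paste screenshot segments vertically into one tall JPEG."""
    segments = [Image.open(BytesIO(b)).convert("RGB") for b in segment_bytes]
    total_height = sum(img.height for img in segments)
    canvas = Image.new("RGB", (segments[0].width, total_height))
    offset = 0
    for img in segments:
        canvas.paste(img, (0, offset))
        offset += img.height
    out = BytesIO()
    canvas.save(out, format="JPEG", quality=85)
    return out.getvalue()
```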
+ + Args: + path (str): The path to save the storage state JSON file + + Returns: + dict: The exported storage state + """ + if self.default_context: + state = await self.default_context.storage_state(path=path) + self.logger.info( + message="Exported storage state to {path}", + tag="INFO", + params={"path": path}, + ) + return state + else: + self.logger.warning( + message="No default_context available to export storage state.", + tag="WARNING", + ) + + async def robust_execute_user_script(self, page: Page, js_code: Union[str, List[str]]) -> Dict[str, Any]: + """ + Executes user-provided JavaScript code with proper error handling and context, + supporting both synchronous and async user code, plus navigations. + + How it works: + 1. Wait for load state 'domcontentloaded' + 2. If js_code is a string, execute it directly + 3. If js_code is a list, execute each element in sequence + 4. Wait for load state 'networkidle' + 5. Return results + + Args: + page (Page): The Playwright page instance + js_code (Union[str, List[str]]): The JavaScript code to execute + + Returns: + Dict[str, Any]: The results of the execution + """ + try: + await page.wait_for_load_state('domcontentloaded') + + if isinstance(js_code, str): + scripts = [js_code] + else: + scripts = js_code + + results = [] + for script in scripts: + try: + # Attempt the evaluate + # If the user code triggers navigation, we catch the "context destroyed" error + # then wait for the new page to load before continuing + result = None + try: + result = await page.evaluate(f""" + (async () => {{ + try {{ + {script} + return {{ success: true }}; + }} catch (err) {{ + return {{ success: false, error: err.toString(), stack: err.stack }}; + }} + }})(); + """) + except Error as e: + # If it's due to navigation destroying the context, handle gracefully + if "Execution context was destroyed" in str(e): + self.logger.info("Navigation triggered by script, waiting for load state", tag="JS_EXEC") + try: + await page.wait_for_load_state('load', timeout=30000) + except Error as nav_err: + self.logger.warning( + message="Navigation wait failed: {error}", + tag="JS_EXEC", + params={"error": str(nav_err)} + ) + try: + await page.wait_for_load_state('networkidle', timeout=30000) + except Error as nav_err: + self.logger.warning( + message="Network idle wait failed: {error}", + tag="JS_EXEC", + params={"error": str(nav_err)} + ) + # Return partial success, or adapt as you see fit + result = { + "success": True, + "info": "Navigation triggered, ignoring context destroyed error" + } + else: + # It's some other error, log and continue + self.logger.error( + message="Playwright execution error: {error}", + tag="JS_EXEC", + params={"error": str(e)} + ) + result = {"success": False, "error": str(e)} + + # If we made it this far with no repeated error, do post-load waits + t1 = time.time() + try: + await page.wait_for_load_state('domcontentloaded', timeout=5000) + print("DOM content loaded after script execution in", time.time() - t1) + except Error as e: + self.logger.warning( + message="DOM content load timeout: {error}", + tag="JS_EXEC", + params={"error": str(e)} + ) + + # t1 = time.time() + # try: + # await page.wait_for_load_state('networkidle', timeout=5000) + # print("Network idle after script execution in", time.time() - t1) + # except Error as e: + # self.logger.warning( + # message="Network idle timeout: {error}", + # tag="JS_EXEC", + # params={"error": str(e)} + # ) + + results.append(result if result else {"success": True}) + + except Exception as e: + # 
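A hedged usage sketch for the script runner above: the first snippet mutates the DOM in place, while the second may trigger a navigation, which the method tolerates by waiting for the new page to load instead of failing on the destroyed execution context.

```python
from playwright.async_api import Page


async def run_user_scripts(strategy, page: Page) -> dict:
    result = await strategy.robust_execute_user_script(
        page,
        [
            "document.querySelector('#accept-cookies')?.click();",  # stays on the same page
            "document.querySelector('a.next-page')?.click();",      # may navigate away
        ],
    )
    # result is {"success": bool, "results": [...]} with one entry per script.
    return result
```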
Catch anything else + self.logger.error( + message="Script chunk failed: {error}", + tag="JS_EXEC", + params={"error": str(e)} + ) + results.append({"success": False, "error": str(e)}) + + return {"success": True, "results": results} + + except Exception as e: + self.logger.error( + message="Script execution failed: {error}", + tag="JS_EXEC", + params={"error": str(e)} + ) + return {"success": False, "error": str(e)} + + async def execute_user_script(self, page: Page, js_code: Union[str, List[str]]) -> Dict[str, Any]: + """ + Executes user-provided JavaScript code with proper error handling and context. + + Args: + page: Playwright page object + js_code: Single JavaScript string or list of JavaScript code strings + + Returns: + Dict containing execution status and results/errors + """ + try: + # Ensure the page is ready for script execution + await page.wait_for_load_state('domcontentloaded') + + # Handle single script or multiple scripts + if isinstance(js_code, str): + scripts = [js_code] + else: + scripts = js_code + + results = [] + for script in scripts: + try: + # Execute the script and wait for network idle + result = await page.evaluate(f""" + (() => {{ + return new Promise((resolve) => {{ + try {{ + const result = (function() {{ + {script} + }})(); + + // If result is a promise, wait for it + if (result instanceof Promise) {{ + result.then(() => {{ + // Wait a bit for any triggered effects + setTimeout(() => resolve({{ success: true }}), 100); + }}).catch(error => {{ + resolve({{ + success: false, + error: error.toString(), + stack: error.stack + }}); + }}); + }} else {{ + // For non-promise results, still wait a bit for effects + setTimeout(() => resolve({{ success: true }}), 100); + }} + }} catch (error) {{ + resolve({{ + success: false, + error: error.toString(), + stack: error.stack + }}); + }} + }}); + }})() + """) + + # Wait for network idle after script execution + t1 = time.time() + await page.wait_for_load_state('domcontentloaded', timeout=5000) + print("DOM content loaded after script execution in", time.time() - t1) + + t1 = time.time() + await page.wait_for_load_state('networkidle', timeout=5000) + print("Network idle after script execution in", time.time() - t1) + + results.append(result if result else {"success": True}) + + except Error as e: + # Handle Playwright-specific errors + self.logger.error( + message="Playwright execution error: {error}", + tag="JS_EXEC", + params={"error": str(e)} + ) + results.append({"success": False, "error": str(e)}) + + return {"success": True, "results": results} + + except Exception as e: + self.logger.error( + message="Script execution failed: {error}", + tag="JS_EXEC", + params={"error": str(e)} + ) + return {"success": False, "error": str(e)} + + except Exception as e: + self.logger.error( + message="Script execution failed: {error}", + tag="JS_EXEC", + params={"error": str(e)} + ) + return {"success": False, "error": str(e)} + + async def check_visibility(self, page): + """ + Checks if an element is visible on the page. 
+ + Args: + page: Playwright page object + + Returns: + Boolean indicating visibility + """ + return await page.evaluate(""" + () => { + const element = document.body; + if (!element) return false; + const style = window.getComputedStyle(element); + const isVisible = style.display !== 'none' && + style.visibility !== 'hidden' && + style.opacity !== '0'; + return isVisible; + } + """) + + async def safe_scroll(self, page: Page, x: int, y: int, delay: float = 0.1): + """ + Safely scroll the page with rendering time. + + Args: + page: Playwright page object + x: Horizontal scroll position + y: Vertical scroll position + """ + result = await self.csp_scroll_to(page, x, y) + if result['success']: + await page.wait_for_timeout(delay * 1000) + return result + + async def csp_scroll_to(self, page: Page, x: int, y: int) -> Dict[str, Any]: + """ + Performs a CSP-compliant scroll operation and returns the result status. + + Args: + page: Playwright page object + x: Horizontal scroll position + y: Vertical scroll position + + Returns: + Dict containing scroll status and position information + """ + try: + result = await page.evaluate( + f"""() => {{ + try {{ + const startX = window.scrollX; + const startY = window.scrollY; + window.scrollTo({x}, {y}); + + // Get final position after scroll + const endX = window.scrollX; + const endY = window.scrollY; + + return {{ + success: true, + startPosition: {{ x: startX, y: startY }}, + endPosition: {{ x: endX, y: endY }}, + targetPosition: {{ x: {x}, y: {y} }}, + delta: {{ + x: Math.abs(endX - {x}), + y: Math.abs(endY - {y}) + }} + }}; + }} catch (e) {{ + return {{ + success: false, + error: e.toString() + }}; + }} + }}""" + ) + + if not result['success']: + self.logger.warning( + message="Scroll operation failed: {error}", + tag="SCROLL", + params={"error": result.get('error')} + ) + + return result + + except Exception as e: + self.logger.error( + message="Failed to execute scroll: {error}", + tag="SCROLL", + params={"error": str(e)} + ) + return { + "success": False, + "error": str(e) + } + + async def get_page_dimensions(self, page: Page): + """ + Get the dimensions of the page. + + Args: + page: Playwright page object + + Returns: + Dict containing width and height of the page + """ + return await page.evaluate(""" + () => { + const {scrollWidth, scrollHeight} = document.documentElement; + return {width: scrollWidth, height: scrollHeight}; + } + """) + + async def page_need_scroll(self, page: Page) -> bool: + """ + Determine whether the page need to scroll + + Args: + page: Playwright page object + + Returns: + bool: True if page needs scrolling + """ + try: + need_scroll = await page.evaluate(""" + () => { + const scrollHeight = document.documentElement.scrollHeight; + const viewportHeight = window.innerHeight; + return scrollHeight > viewportHeight; + } + """) + return need_scroll + except Exception as e: + self.logger.warning( + message="Failed to check scroll need: {error}. 
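A hedged sketch tying the scroll helpers above together: measure the document, decide whether scrolling is needed at all, then scroll to the bottom with a short render delay.

```python
from playwright.async_api import Page


async def scroll_to_bottom_if_needed(strategy, page: Page) -> None:
    if await strategy.page_need_scroll(page):
        dimensions = await strategy.get_page_dimensions(page)  # {"width": ..., "height": ...}
        await strategy.safe_scroll(page, 0, dimensions["height"], delay=0.2)
```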
Defaulting to True for safety.", + tag="SCROLL", + params={"error": str(e)} + ) + return True # Default to scrolling if check fails \ No newline at end of file diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py new file mode 100644 index 0000000000000000000000000000000000000000..aed9c76b0eb47772255b068f7966eb9b419626c8 --- /dev/null +++ b/crawl4ai/async_database.py @@ -0,0 +1,495 @@ +import os, sys +from pathlib import Path +import aiosqlite +import asyncio +from typing import Optional, Tuple, Dict +from contextlib import asynccontextmanager +import logging +import json # Added for serialization/deserialization +from .utils import ensure_content_dirs, generate_content_hash +from .models import CrawlResult, MarkdownGenerationResult +import xxhash +import aiofiles +from .config import NEED_MIGRATION +from .version_manager import VersionManager +from .async_logger import AsyncLogger +from .utils import get_error_context, create_box_message +# Set up logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +base_directory = DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") +os.makedirs(DB_PATH, exist_ok=True) +DB_PATH = os.path.join(base_directory, "crawl4ai.db") + +class AsyncDatabaseManager: + def __init__(self, pool_size: int = 10, max_retries: int = 3): + self.db_path = DB_PATH + self.content_paths = ensure_content_dirs(os.path.dirname(DB_PATH)) + self.pool_size = pool_size + self.max_retries = max_retries + self.connection_pool: Dict[int, aiosqlite.Connection] = {} + self.pool_lock = asyncio.Lock() + self.init_lock = asyncio.Lock() + self.connection_semaphore = asyncio.Semaphore(pool_size) + self._initialized = False + self.version_manager = VersionManager() + self.logger = AsyncLogger( + log_file=os.path.join(base_directory, ".crawl4ai", "crawler_db.log"), + verbose=False, + tag_width=10 + ) + + + async def initialize(self): + """Initialize the database and connection pool""" + try: + self.logger.info("Initializing database", tag="INIT") + # Ensure the database file exists + os.makedirs(os.path.dirname(self.db_path), exist_ok=True) + + # Check if version update is needed + needs_update = self.version_manager.needs_update() + + # Always ensure base table exists + await self.ainit_db() + + # Verify the table exists + async with aiosqlite.connect(self.db_path, timeout=30.0) as db: + async with db.execute( + "SELECT name FROM sqlite_master WHERE type='table' AND name='crawled_data'" + ) as cursor: + result = await cursor.fetchone() + if not result: + raise Exception("crawled_data table was not created") + + # If version changed or fresh install, run updates + if needs_update: + self.logger.info("New version detected, running updates", tag="INIT") + await self.update_db_schema() + from .migrations import run_migration # Import here to avoid circular imports + await run_migration() + self.version_manager.update_version() # Update stored version after successful migration + self.logger.success("Version update completed successfully", tag="COMPLETE") + else: + self.logger.success("Database initialization completed successfully", tag="COMPLETE") + + + except Exception as e: + self.logger.error( + message="Database initialization error: {error}", + tag="ERROR", + params={"error": str(e)} + ) + self.logger.info( + message="Database will be initialized on first use", + tag="INIT" + ) + + raise + + + async def cleanup(self): + """Cleanup connections when shutting down""" + async with self.pool_lock: + for conn in 
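The database path above is derived from an environment variable with a home-directory fallback, and it is resolved at import time. A hedged sketch of relocating it (the variable name comes from the code; the target path is illustrative):

```python
import os

# Must be set before crawl4ai.async_database is imported,
# because DB_PATH is computed when the module loads.
os.environ["CRAWL4_AI_BASE_DIRECTORY"] = "/data/crawler"

from crawl4ai.async_database import async_db_manager  # noqa: E402

# The SQLite file then lives at /data/crawler/.crawl4ai/crawl4ai.db
```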
self.connection_pool.values(): + await conn.close() + self.connection_pool.clear() + + @asynccontextmanager + async def get_connection(self): + """Connection pool manager with enhanced error handling""" + if not self._initialized: + async with self.init_lock: + if not self._initialized: + try: + await self.initialize() + self._initialized = True + except Exception as e: + import sys + error_context = get_error_context(sys.exc_info()) + self.logger.error( + message="Database initialization failed:\n{error}\n\nContext:\n{context}\n\nTraceback:\n{traceback}", + tag="ERROR", + force_verbose=True, + params={ + "error": str(e), + "context": error_context["code_context"], + "traceback": error_context["full_traceback"] + } + ) + raise + + await self.connection_semaphore.acquire() + task_id = id(asyncio.current_task()) + + try: + async with self.pool_lock: + if task_id not in self.connection_pool: + try: + conn = await aiosqlite.connect( + self.db_path, + timeout=30.0 + ) + await conn.execute('PRAGMA journal_mode = WAL') + await conn.execute('PRAGMA busy_timeout = 5000') + + # Verify database structure + async with conn.execute("PRAGMA table_info(crawled_data)") as cursor: + columns = await cursor.fetchall() + column_names = [col[1] for col in columns] + expected_columns = { + 'url', 'html', 'cleaned_html', 'markdown', 'extracted_content', + 'success', 'media', 'links', 'metadata', 'screenshot', + 'response_headers', 'downloaded_files' + } + missing_columns = expected_columns - set(column_names) + if missing_columns: + raise ValueError(f"Database missing columns: {missing_columns}") + + self.connection_pool[task_id] = conn + except Exception as e: + import sys + error_context = get_error_context(sys.exc_info()) + error_message = ( + f"Unexpected error in db get_connection at line {error_context['line_no']} " + f"in {error_context['function']} ({error_context['filename']}):\n" + f"Error: {str(e)}\n\n" + f"Code context:\n{error_context['code_context']}" + ) + self.logger.error( + message=create_box_message(error_message, type= "error"), + ) + + raise + + yield self.connection_pool[task_id] + + except Exception as e: + import sys + error_context = get_error_context(sys.exc_info()) + error_message = ( + f"Unexpected error in db get_connection at line {error_context['line_no']} " + f"in {error_context['function']} ({error_context['filename']}):\n" + f"Error: {str(e)}\n\n" + f"Code context:\n{error_context['code_context']}" + ) + self.logger.error( + message=create_box_message(error_message, type= "error"), + ) + raise + finally: + async with self.pool_lock: + if task_id in self.connection_pool: + await self.connection_pool[task_id].close() + del self.connection_pool[task_id] + self.connection_semaphore.release() + + + async def execute_with_retry(self, operation, *args): + """Execute database operations with retry logic""" + for attempt in range(self.max_retries): + try: + async with self.get_connection() as db: + result = await operation(db, *args) + await db.commit() + return result + except Exception as e: + if attempt == self.max_retries - 1: + self.logger.error( + message="Operation failed after {retries} attempts: {error}", + tag="ERROR", + force_verbose=True, + params={ + "retries": self.max_retries, + "error": str(e) + } + ) + raise + await asyncio.sleep(1 * (attempt + 1)) # Exponential backoff + + async def ainit_db(self): + """Initialize database schema""" + async with aiosqlite.connect(self.db_path, timeout=30.0) as db: + await db.execute(''' + CREATE TABLE IF NOT EXISTS crawled_data ( + url 
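Database operations are funnelled through `execute_with_retry`, which acquires a pooled connection, commits on success, and sleeps between attempts (1s, then 2s, and so on). A hedged sketch of the calling convention used by the query helpers later in this file:

```python
async def count_rows(db_manager) -> int:
    async def _count(db):
        async with db.execute("SELECT COUNT(*) FROM crawled_data") as cursor:
            row = await cursor.fetchone()
            return row[0] if row else 0

    # Retries up to max_retries times before re-raising the last error.
    return await db_manager.execute_with_retry(_count)
```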
TEXT PRIMARY KEY, + html TEXT, + cleaned_html TEXT, + markdown TEXT, + extracted_content TEXT, + success BOOLEAN, + media TEXT DEFAULT "{}", + links TEXT DEFAULT "{}", + metadata TEXT DEFAULT "{}", + screenshot TEXT DEFAULT "", + response_headers TEXT DEFAULT "{}", + downloaded_files TEXT DEFAULT "{}" -- New column added + ) + ''') + await db.commit() + + + + async def update_db_schema(self): + """Update database schema if needed""" + async with aiosqlite.connect(self.db_path, timeout=30.0) as db: + cursor = await db.execute("PRAGMA table_info(crawled_data)") + columns = await cursor.fetchall() + column_names = [column[1] for column in columns] + + # List of new columns to add + new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers', 'downloaded_files'] + + for column in new_columns: + if column not in column_names: + await self.aalter_db_add_column(column, db) + await db.commit() + + async def aalter_db_add_column(self, new_column: str, db): + """Add new column to the database""" + if new_column == 'response_headers': + await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"') + else: + await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""') + self.logger.info( + message="Added column '{column}' to the database", + tag="INIT", + params={"column": new_column} + ) + + + async def aget_cached_url(self, url: str) -> Optional[CrawlResult]: + """Retrieve cached URL data as CrawlResult""" + async def _get(db): + async with db.execute( + 'SELECT * FROM crawled_data WHERE url = ?', (url,) + ) as cursor: + row = await cursor.fetchone() + if not row: + return None + + # Get column names + columns = [description[0] for description in cursor.description] + # Create dict from row data + row_dict = dict(zip(columns, row)) + + # Load content from files using stored hashes + content_fields = { + 'html': row_dict['html'], + 'cleaned_html': row_dict['cleaned_html'], + 'markdown': row_dict['markdown'], + 'extracted_content': row_dict['extracted_content'], + 'screenshot': row_dict['screenshot'], + 'screenshots': row_dict['screenshot'], + } + + for field, hash_value in content_fields.items(): + if hash_value: + content = await self._load_content( + hash_value, + field.split('_')[0] # Get content type from field name + ) + row_dict[field] = content or "" + else: + row_dict[field] = "" + + # Parse JSON fields + json_fields = ['media', 'links', 'metadata', 'response_headers', 'markdown'] + for field in json_fields: + try: + row_dict[field] = json.loads(row_dict[field]) if row_dict[field] else {} + except json.JSONDecodeError: + row_dict[field] = {} + + if isinstance(row_dict['markdown'], Dict): + row_dict['markdown_v2'] = row_dict['markdown'] + if row_dict['markdown'].get('raw_markdown'): + row_dict['markdown'] = row_dict['markdown']['raw_markdown'] + + # Parse downloaded_files + try: + row_dict['downloaded_files'] = json.loads(row_dict['downloaded_files']) if row_dict['downloaded_files'] else [] + except json.JSONDecodeError: + row_dict['downloaded_files'] = [] + + # Remove any fields not in CrawlResult model + valid_fields = CrawlResult.__annotations__.keys() + filtered_dict = {k: v for k, v in row_dict.items() if k in valid_fields} + + return CrawlResult(**filtered_dict) + + try: + return await self.execute_with_retry(_get) + except Exception as e: + self.logger.error( + message="Error retrieving cached URL: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) + return None + + async def 
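From the caller's side, the cache lookup above returns a re-hydrated `CrawlResult` (content fields are loaded back from the hash-addressed files). A hedged sketch using the module-level singleton defined at the bottom of this file:

```python
import asyncio

from crawl4ai.async_database import async_db_manager


async def main():
    cached = await async_db_manager.aget_cached_url("https://example.com")
    if cached is None:
        print("not cached yet")
    else:
        print(cached.success, len(cached.html), type(cached.markdown))


asyncio.run(main())
```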
acache_url(self, result: CrawlResult): + """Cache CrawlResult data""" + # Store content files and get hashes + content_map = { + 'html': (result.html, 'html'), + 'cleaned_html': (result.cleaned_html or "", 'cleaned'), + 'markdown': None, + 'extracted_content': (result.extracted_content or "", 'extracted'), + 'screenshot': (result.screenshot or "", 'screenshots') + } + + try: + if isinstance(result.markdown, MarkdownGenerationResult): + content_map['markdown'] = (result.markdown.model_dump_json(), 'markdown') + elif hasattr(result, 'markdown_v2'): + content_map['markdown'] = (result.markdown_v2.model_dump_json(), 'markdown') + elif isinstance(result.markdown, str): + markdown_result = MarkdownGenerationResult(raw_markdown=result.markdown) + content_map['markdown'] = (markdown_result.model_dump_json(), 'markdown') + else: + content_map['markdown'] = (MarkdownGenerationResult().model_dump_json(), 'markdown') + except Exception as e: + self.logger.warning( + message=f"Error processing markdown content: {str(e)}", + tag="WARNING" + ) + # Fallback to empty markdown result + content_map['markdown'] = (MarkdownGenerationResult().model_dump_json(), 'markdown') + + content_hashes = {} + for field, (content, content_type) in content_map.items(): + content_hashes[field] = await self._store_content(content, content_type) + + async def _cache(db): + await db.execute(''' + INSERT INTO crawled_data ( + url, html, cleaned_html, markdown, + extracted_content, success, media, links, metadata, + screenshot, response_headers, downloaded_files + ) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ON CONFLICT(url) DO UPDATE SET + html = excluded.html, + cleaned_html = excluded.cleaned_html, + markdown = excluded.markdown, + extracted_content = excluded.extracted_content, + success = excluded.success, + media = excluded.media, + links = excluded.links, + metadata = excluded.metadata, + screenshot = excluded.screenshot, + response_headers = excluded.response_headers, + downloaded_files = excluded.downloaded_files + ''', ( + result.url, + content_hashes['html'], + content_hashes['cleaned_html'], + content_hashes['markdown'], + content_hashes['extracted_content'], + result.success, + json.dumps(result.media), + json.dumps(result.links), + json.dumps(result.metadata or {}), + content_hashes['screenshot'], + json.dumps(result.response_headers or {}), + json.dumps(result.downloaded_files or []) + )) + + try: + await self.execute_with_retry(_cache) + except Exception as e: + self.logger.error( + message="Error caching URL: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) + + + async def aget_total_count(self) -> int: + """Get total number of cached URLs""" + async def _count(db): + async with db.execute('SELECT COUNT(*) FROM crawled_data') as cursor: + result = await cursor.fetchone() + return result[0] if result else 0 + + try: + return await self.execute_with_retry(_count) + except Exception as e: + self.logger.error( + message="Error getting total count: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) + return 0 + + async def aclear_db(self): + """Clear all data from the database""" + async def _clear(db): + await db.execute('DELETE FROM crawled_data') + + try: + await self.execute_with_retry(_clear) + except Exception as e: + self.logger.error( + message="Error clearing database: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) + + async def aflush_db(self): + """Drop the entire table""" + async def _flush(db): + await 
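And the write path plus a maintenance helper. A hedged sketch that assumes a `CrawlResult` obtained from a crawl; `acache_url` normalizes both plain-string and `MarkdownGenerationResult` markdown before storing.

```python
from crawl4ai.async_database import async_db_manager


async def cache_and_inspect(result) -> int:
    # result: a crawl4ai CrawlResult produced by a crawl.
    await async_db_manager.acache_url(result)
    return await async_db_manager.aget_total_count()
```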
db.execute('DROP TABLE IF EXISTS crawled_data') + + try: + await self.execute_with_retry(_flush) + except Exception as e: + self.logger.error( + message="Error flushing database: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) + + + async def _store_content(self, content: str, content_type: str) -> str: + """Store content in filesystem and return hash""" + if not content: + return "" + + content_hash = generate_content_hash(content) + file_path = os.path.join(self.content_paths[content_type], content_hash) + + # Only write if file doesn't exist + if not os.path.exists(file_path): + async with aiofiles.open(file_path, 'w', encoding='utf-8') as f: + await f.write(content) + + return content_hash + + async def _load_content(self, content_hash: str, content_type: str) -> Optional[str]: + """Load content from filesystem by hash""" + if not content_hash: + return None + + file_path = os.path.join(self.content_paths[content_type], content_hash) + try: + async with aiofiles.open(file_path, 'r', encoding='utf-8') as f: + return await f.read() + except: + self.logger.error( + message="Failed to load content: {file_path}", + tag="ERROR", + force_verbose=True, + params={"file_path": file_path} + ) + return None + +# Create a singleton instance +async_db_manager = AsyncDatabaseManager() diff --git a/crawl4ai/async_logger.py b/crawl4ai/async_logger.py new file mode 100644 index 0000000000000000000000000000000000000000..5d2d54b5e10314b74f4b648ad7495cb24cf6fbf0 --- /dev/null +++ b/crawl4ai/async_logger.py @@ -0,0 +1,231 @@ +from enum import Enum +from typing import Optional, Dict, Any, Union +from colorama import Fore, Back, Style, init +import time +import os +from datetime import datetime + +class LogLevel(Enum): + DEBUG = 1 + INFO = 2 + SUCCESS = 3 + WARNING = 4 + ERROR = 5 + +class AsyncLogger: + """ + Asynchronous logger with support for colored console output and file logging. + Supports templated messages with colored components. + """ + + DEFAULT_ICONS = { + 'INIT': '→', + 'READY': '✓', + 'FETCH': '↓', + 'SCRAPE': '◆', + 'EXTRACT': '■', + 'COMPLETE': '●', + 'ERROR': '×', + 'DEBUG': '⋯', + 'INFO': 'ℹ', + 'WARNING': '⚠', + } + + DEFAULT_COLORS = { + LogLevel.DEBUG: Fore.LIGHTBLACK_EX, + LogLevel.INFO: Fore.CYAN, + LogLevel.SUCCESS: Fore.GREEN, + LogLevel.WARNING: Fore.YELLOW, + LogLevel.ERROR: Fore.RED, + } + + def __init__( + self, + log_file: Optional[str] = None, + log_level: LogLevel = LogLevel.DEBUG, + tag_width: int = 10, + icons: Optional[Dict[str, str]] = None, + colors: Optional[Dict[LogLevel, str]] = None, + verbose: bool = True + ): + """ + Initialize the logger. 
+ + Args: + log_file: Optional file path for logging + log_level: Minimum log level to display + tag_width: Width for tag formatting + icons: Custom icons for different tags + colors: Custom colors for different log levels + verbose: Whether to output to console + """ + init() # Initialize colorama + self.log_file = log_file + self.log_level = log_level + self.tag_width = tag_width + self.icons = icons or self.DEFAULT_ICONS + self.colors = colors or self.DEFAULT_COLORS + self.verbose = verbose + + # Create log file directory if needed + if log_file: + os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True) + + def _format_tag(self, tag: str) -> str: + """Format a tag with consistent width.""" + return f"[{tag}]".ljust(self.tag_width, ".") + + def _get_icon(self, tag: str) -> str: + """Get the icon for a tag, defaulting to info icon if not found.""" + return self.icons.get(tag, self.icons['INFO']) + + def _write_to_file(self, message: str): + """Write a message to the log file if configured.""" + if self.log_file: + timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] + with open(self.log_file, 'a', encoding='utf-8') as f: + # Strip ANSI color codes for file output + clean_message = message.replace(Fore.RESET, '').replace(Style.RESET_ALL, '') + for color in vars(Fore).values(): + if isinstance(color, str): + clean_message = clean_message.replace(color, '') + f.write(f"[{timestamp}] {clean_message}\n") + + def _log( + self, + level: LogLevel, + message: str, + tag: str, + params: Optional[Dict[str, Any]] = None, + colors: Optional[Dict[str, str]] = None, + base_color: Optional[str] = None, + **kwargs + ): + """ + Core logging method that handles message formatting and output. + + Args: + level: Log level for this message + message: Message template string + tag: Tag for the message + params: Parameters to format into the message + colors: Color overrides for specific parameters + base_color: Base color for the entire message + """ + if level.value < self.log_level.value: + return + + # Format the message with parameters if provided + if params: + try: + # First format the message with raw parameters + formatted_message = message.format(**params) + + # Then apply colors if specified + if colors: + for key, color in colors.items(): + # Find the formatted value in the message and wrap it with color + if key in params: + value_str = str(params[key]) + formatted_message = formatted_message.replace( + value_str, + f"{color}{value_str}{Style.RESET_ALL}" + ) + + except KeyError as e: + formatted_message = f"LOGGING ERROR: Missing parameter {e} in message template" + level = LogLevel.ERROR + else: + formatted_message = message + + # Construct the full log line + color = base_color or self.colors[level] + log_line = f"{color}{self._format_tag(tag)} {self._get_icon(tag)} {formatted_message}{Style.RESET_ALL}" + + # Output to console if verbose + if self.verbose or kwargs.get("force_verbose", False): + print(log_line) + + # Write to file if configured + self._write_to_file(log_line) + + def debug(self, message: str, tag: str = "DEBUG", **kwargs): + """Log a debug message.""" + self._log(LogLevel.DEBUG, message, tag, **kwargs) + + def info(self, message: str, tag: str = "INFO", **kwargs): + """Log an info message.""" + self._log(LogLevel.INFO, message, tag, **kwargs) + + def success(self, message: str, tag: str = "SUCCESS", **kwargs): + """Log a success message.""" + self._log(LogLevel.SUCCESS, message, tag, **kwargs) + + def warning(self, message: str, tag: str = "WARNING", 
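A hedged usage sketch of the logger above, showing templated parameters and per-parameter colouring; the log-file path is illustrative.

```python
from colorama import Fore

from crawl4ai.async_logger import AsyncLogger, LogLevel

logger = AsyncLogger(log_file="/tmp/crawler.log", log_level=LogLevel.DEBUG, verbose=True)

logger.info(
    message="Fetched {url} in {timing:.2f}s",
    tag="FETCH",
    params={"url": "https://example.com", "timing": 1.42},
    colors={"timing": Fore.YELLOW},  # only this value is wrapped in colour codes
)
```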
**kwargs): + """Log a warning message.""" + self._log(LogLevel.WARNING, message, tag, **kwargs) + + def error(self, message: str, tag: str = "ERROR", **kwargs): + """Log an error message.""" + self._log(LogLevel.ERROR, message, tag, **kwargs) + + def url_status( + self, + url: str, + success: bool, + timing: float, + tag: str = "FETCH", + url_length: int = 50 + ): + """ + Convenience method for logging URL fetch status. + + Args: + url: The URL being processed + success: Whether the operation was successful + timing: Time taken for the operation + tag: Tag for the message + url_length: Maximum length for URL in log + """ + self._log( + level=LogLevel.SUCCESS if success else LogLevel.ERROR, + message="{url:.{url_length}}... | Status: {status} | Time: {timing:.2f}s", + tag=tag, + params={ + "url": url, + "url_length": url_length, + "status": success, + "timing": timing + }, + colors={ + "status": Fore.GREEN if success else Fore.RED, + "timing": Fore.YELLOW + } + ) + + def error_status( + self, + url: str, + error: str, + tag: str = "ERROR", + url_length: int = 50 + ): + """ + Convenience method for logging error status. + + Args: + url: The URL being processed + error: Error message + tag: Tag for the message + url_length: Maximum length for URL in log + """ + self._log( + level=LogLevel.ERROR, + message="{url:.{url_length}}... | Error: {error}", + tag=tag, + params={ + "url": url, + "url_length": url_length, + "error": error + } + ) \ No newline at end of file diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py new file mode 100644 index 0000000000000000000000000000000000000000..6ed8ec8f5c093505485728337ca1813442438d0f --- /dev/null +++ b/crawl4ai/async_webcrawler.py @@ -0,0 +1,833 @@ +import os, sys +import time +import warnings +from enum import Enum +from colorama import init, Fore, Back, Style +from pathlib import Path +from typing import Optional, List, Union +import json +import asyncio +# from contextlib import nullcontext, asynccontextmanager +from contextlib import asynccontextmanager +from .models import CrawlResult, MarkdownGenerationResult +from .async_database import async_db_manager +from .chunking_strategy import * +from .content_filter_strategy import * +from .extraction_strategy import * +from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse +from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode +from .markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGenerationStrategy +from .content_scraping_strategy import WebScrapingStrategy +from .async_logger import AsyncLogger +from .async_configs import BrowserConfig, CrawlerRunConfig +from .config import ( + MIN_WORD_THRESHOLD, + IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, + URL_LOG_SHORTEN_LENGTH +) +from .utils import ( + sanitize_input_encode, + InvalidCSSSelectorError, + format_html, + fast_format_html, + create_box_message +) + +from urllib.parse import urlparse +import random +from .__version__ import __version__ as crawl4ai_version + + +class AsyncWebCrawler: + """ + Asynchronous web crawler with flexible caching capabilities. + + There are two ways to use the crawler: + + 1. Using context manager (recommended for simple cases): + ```python + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://example.com") + ``` + + 2. 
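The convenience wrappers above reduce per-URL logging to a single call each; a hedged sketch:

```python
from crawl4ai.async_logger import AsyncLogger

logger = AsyncLogger(verbose=True)

# Success and failure lines with consistent truncation of long URLs.
logger.url_status(url="https://example.com/some/very/long/path", success=True, timing=0.87)
logger.error_status(url="https://example.com/broken", error="HTTP 503 from upstream")
```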
Using explicit lifecycle management (recommended for long-running applications): + ```python + crawler = AsyncWebCrawler() + await crawler.start() + + # Use the crawler multiple times + result1 = await crawler.arun(url="https://example.com") + result2 = await crawler.arun(url="https://another.com") + + await crawler.close() + ``` + + Migration Guide: + Old way (deprecated): + crawler = AsyncWebCrawler(always_by_pass_cache=True, browser_type="chromium", headless=True) + + New way (recommended): + browser_config = BrowserConfig(browser_type="chromium", headless=True) + crawler = AsyncWebCrawler(config=browser_config) + + + Attributes: + browser_config (BrowserConfig): Configuration object for browser settings. + crawler_strategy (AsyncCrawlerStrategy): Strategy for crawling web pages. + logger (AsyncLogger): Logger instance for recording events and errors. + always_bypass_cache (bool): Whether to always bypass cache. + crawl4ai_folder (str): Directory for storing cache. + base_directory (str): Base directory for storing cache. + ready (bool): Whether the crawler is ready for use. + + Methods: + start(): Start the crawler explicitly without using context manager. + close(): Close the crawler explicitly without using context manager. + arun(): Run the crawler for a single source: URL (web, local file, or raw HTML). + awarmup(): Perform warmup sequence. + arun_many(): Run the crawler for multiple sources. + aprocess_html(): Process HTML content. + + Typical Usage: + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://example.com") + print(result.markdown) + + Using configuration: + browser_config = BrowserConfig(browser_type="chromium", headless=True) + async with AsyncWebCrawler(config=browser_config) as crawler: + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS + ) + result = await crawler.arun(url="https://example.com", config=crawler_config) + print(result.markdown) + """ + _domain_last_hit = {} + + def __init__( + self, + crawler_strategy: Optional[AsyncCrawlerStrategy] = None, + config: Optional[BrowserConfig] = None, + always_bypass_cache: bool = False, + always_by_pass_cache: Optional[bool] = None, # Deprecated parameter + base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), + thread_safe: bool = False, + **kwargs, + ): + """ + Initialize the AsyncWebCrawler. + + Args: + crawler_strategy: Strategy for crawling web pages. If None, will create AsyncPlaywrightCrawlerStrategy + config: Configuration object for browser settings. If None, will be created from kwargs + always_bypass_cache: Whether to always bypass cache (new parameter) + always_by_pass_cache: Deprecated, use always_bypass_cache instead + base_directory: Base directory for storing cache + thread_safe: Whether to use thread-safe operations + **kwargs: Additional arguments for backwards compatibility + """ + # Handle browser configuration + browser_config = config + if browser_config is not None: + if any(k in kwargs for k in ["browser_type", "headless", "viewport_width", "viewport_height"]): + self.logger.warning( + message="Both browser_config and legacy browser parameters provided. 
browser_config will take precedence.", + tag="WARNING" + ) + else: + # Create browser config from kwargs for backwards compatibility + browser_config = BrowserConfig.from_kwargs(kwargs) + + self.browser_config = browser_config + + # Initialize logger first since other components may need it + self.logger = AsyncLogger( + log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"), + verbose=self.browser_config.verbose, + tag_width=10 + ) + + + # Initialize crawler strategy + params = { + k:v for k, v in kwargs.items() if k in ['browser_congig', 'logger'] + } + self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy( + browser_config=browser_config, + logger=self.logger, + **params # Pass remaining kwargs for backwards compatibility + ) + + # If craweler strategy doesnt have logger, use crawler logger + if not self.crawler_strategy.logger: + self.crawler_strategy.logger = self.logger + + # Handle deprecated cache parameter + if always_by_pass_cache is not None: + if kwargs.get("warning", True): + warnings.warn( + "'always_by_pass_cache' is deprecated and will be removed in version 0.5.0. " + "Use 'always_bypass_cache' instead. " + "Pass warning=False to suppress this warning.", + DeprecationWarning, + stacklevel=2 + ) + self.always_bypass_cache = always_by_pass_cache + else: + self.always_bypass_cache = always_bypass_cache + + # Thread safety setup + self._lock = asyncio.Lock() if thread_safe else None + + # Initialize directories + self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai") + os.makedirs(self.crawl4ai_folder, exist_ok=True) + os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) + + self.ready = False + + async def start(self): + """ + Start the crawler explicitly without using context manager. + This is equivalent to using 'async with' but gives more control over the lifecycle. + + This method will: + 1. Initialize the browser and context + 2. Perform warmup sequence + 3. Return the crawler instance for method chaining + + Returns: + AsyncWebCrawler: The initialized crawler instance + """ + await self.crawler_strategy.__aenter__() + await self.awarmup() + return self + + async def close(self): + """ + Close the crawler explicitly without using context manager. + This should be called when you're done with the crawler if you used start(). + + This method will: + 1. Clean up browser resources + 2. Close any open pages and contexts + """ + await self.crawler_strategy.__aexit__(None, None, None) + + async def __aenter__(self): + return await self.start() + + async def __aexit__(self, exc_type, exc_val, exc_tb): + await self.close() + + async def awarmup(self): + """ + Initialize the crawler with warm-up sequence. + + This method: + 1. Logs initialization info + 2. Sets up browser configuration + 3. 
Marks the crawler as ready + """ + self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT") + self.ready = True + + @asynccontextmanager + async def nullcontext(self): + """异步空上下文管理器""" + yield + + async def arun( + self, + url: str, + config: Optional[CrawlerRunConfig] = None, + # Legacy parameters maintained for backwards compatibility + word_count_threshold=MIN_WORD_THRESHOLD, + extraction_strategy: ExtractionStrategy = None, + chunking_strategy: ChunkingStrategy = RegexChunking(), + content_filter: RelevantContentFilter = None, + cache_mode: Optional[CacheMode] = None, + # Deprecated cache parameters + bypass_cache: bool = False, + disable_cache: bool = False, + no_cache_read: bool = False, + no_cache_write: bool = False, + # Other legacy parameters + css_selector: str = None, + screenshot: bool = False, + pdf: bool = False, + user_agent: str = None, + verbose=True, + **kwargs, + ) -> CrawlResult: + """ + Runs the crawler for a single source: URL (web, local file, or raw HTML). + + Migration Guide: + Old way (deprecated): + result = await crawler.arun( + url="https://example.com", + word_count_threshold=200, + screenshot=True, + ... + ) + + New way (recommended): + config = CrawlerRunConfig( + word_count_threshold=200, + screenshot=True, + ... + ) + result = await crawler.arun(url="https://example.com", crawler_config=config) + + Args: + url: The URL to crawl (http://, https://, file://, or raw:) + crawler_config: Configuration object controlling crawl behavior + [other parameters maintained for backwards compatibility] + + Returns: + CrawlResult: The result of crawling and processing + """ + crawler_config = config + if not isinstance(url, str) or not url: + raise ValueError("Invalid URL, make sure the URL is a non-empty string") + + async with self._lock or self.nullcontext(): + try: + # Handle configuration + if crawler_config is not None: + # if any(param is not None for param in [ + # word_count_threshold, extraction_strategy, chunking_strategy, + # content_filter, cache_mode, css_selector, screenshot, pdf + # ]): + # self.logger.warning( + # message="Both crawler_config and legacy parameters provided. crawler_config will take precedence.", + # tag="WARNING" + # ) + config = crawler_config + else: + # Merge all parameters into a single kwargs dict for config creation + config_kwargs = { + "word_count_threshold": word_count_threshold, + "extraction_strategy": extraction_strategy, + "chunking_strategy": chunking_strategy, + "content_filter": content_filter, + "cache_mode": cache_mode, + "bypass_cache": bypass_cache, + "disable_cache": disable_cache, + "no_cache_read": no_cache_read, + "no_cache_write": no_cache_write, + "css_selector": css_selector, + "screenshot": screenshot, + "pdf": pdf, + "verbose": verbose, + **kwargs + } + config = CrawlerRunConfig.from_kwargs(config_kwargs) + + # Handle deprecated cache parameters + if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]): + if kwargs.get("warning", True): + warnings.warn( + "Cache control boolean flags are deprecated and will be removed in version 0.5.0. 
" + "Use 'cache_mode' parameter instead.", + DeprecationWarning, + stacklevel=2 + ) + + # Convert legacy parameters if cache_mode not provided + if config.cache_mode is None: + config.cache_mode = _legacy_to_cache_mode( + disable_cache=disable_cache, + bypass_cache=bypass_cache, + no_cache_read=no_cache_read, + no_cache_write=no_cache_write + ) + + # Default to ENABLED if no cache mode specified + if config.cache_mode is None: + config.cache_mode = CacheMode.ENABLED + + # Create cache context + cache_context = CacheContext(url, config.cache_mode, self.always_bypass_cache) + + # Initialize processing variables + async_response: AsyncCrawlResponse = None + cached_result: CrawlResult = None + screenshot_data = None + pdf_data = None + extracted_content = None + start_time = time.perf_counter() + + # Try to get cached result if appropriate + if cache_context.should_read(): + cached_result = await async_db_manager.aget_cached_url(url) + + if cached_result: + html = sanitize_input_encode(cached_result.html) + extracted_content = sanitize_input_encode(cached_result.extracted_content or "") + extracted_content = None if not extracted_content or extracted_content == "[]" else extracted_content + # If screenshot is requested but its not in cache, then set cache_result to None + screenshot_data = cached_result.screenshot + pdf_data = cached_result.pdf + if config.screenshot and not screenshot or config.pdf and not pdf: + cached_result = None + + self.logger.url_status( + url=cache_context.display_url, + success=bool(html), + timing=time.perf_counter() - start_time, + tag="FETCH" + ) + + # Fetch fresh content if needed + if not cached_result or not html: + t1 = time.perf_counter() + + if user_agent: + self.crawler_strategy.update_user_agent(user_agent) + + # Pass config to crawl method + async_response = await self.crawler_strategy.crawl( + url, + config=config # Pass the entire config object + ) + + html = sanitize_input_encode(async_response.html) + screenshot_data = async_response.screenshot + pdf_data = async_response.pdf_data + + t2 = time.perf_counter() + self.logger.url_status( + url=cache_context.display_url, + success=bool(html), + timing=t2 - t1, + tag="FETCH" + ) + + # Process the HTML content + crawl_result = await self.aprocess_html( + url=url, + html=html, + extracted_content=extracted_content, + config=config, # Pass the config object instead of individual parameters + screenshot=screenshot_data, + pdf_data=pdf_data, + verbose=config.verbose, + is_raw_html = True if url.startswith("raw:") else False, + **kwargs + ) + + crawl_result.status_code = async_response.status_code + crawl_result.response_headers = async_response.response_headers + crawl_result.downloaded_files = async_response.downloaded_files + crawl_result.ssl_certificate = async_response.ssl_certificate # Add SSL certificate + + # # Check and set values from async_response to crawl_result + # try: + # for key in vars(async_response): + # if hasattr(crawl_result, key): + # value = getattr(async_response, key, None) + # current_value = getattr(crawl_result, key, None) + # if value is not None and not current_value: + # try: + # setattr(crawl_result, key, value) + # except Exception as e: + # self.logger.warning( + # message=f"Failed to set attribute {key}: {str(e)}", + # tag="WARNING" + # ) + # except Exception as e: + # self.logger.warning( + # message=f"Error copying response attributes: {str(e)}", + # tag="WARNING" + # ) + + crawl_result.success = bool(html) + crawl_result.session_id = getattr(config, 'session_id', None) + + 
self.logger.success( + message="{url:.50}... | Status: {status} | Total: {timing}", + tag="COMPLETE", + params={ + "url": cache_context.display_url, + "status": crawl_result.success, + "timing": f"{time.perf_counter() - start_time:.2f}s" + }, + colors={ + "status": Fore.GREEN if crawl_result.success else Fore.RED, + "timing": Fore.YELLOW + } + ) + + # Update cache if appropriate + if cache_context.should_write() and not bool(cached_result): + await async_db_manager.acache_url(crawl_result) + + return crawl_result + + else: + self.logger.success( + message="{url:.50}... | Status: {status} | Total: {timing}", + tag="COMPLETE", + params={ + "url": cache_context.display_url, + "status": True, + "timing": f"{time.perf_counter() - start_time:.2f}s" + }, + colors={ + "status": Fore.GREEN, + "timing": Fore.YELLOW + } + ) + + cached_result.success = bool(html) + cached_result.session_id = getattr(config, 'session_id', None) + return cached_result + + except Exception as e: + error_context = get_error_context(sys.exc_info()) + + error_message = ( + f"Unexpected error in _crawl_web at line {error_context['line_no']} " + f"in {error_context['function']} ({error_context['filename']}):\n" + f"Error: {str(e)}\n\n" + f"Code context:\n{error_context['code_context']}" + ) + # if not hasattr(e, "msg"): + # e.msg = str(e) + + self.logger.error_status( + url=url, + error=create_box_message(error_message, type="error"), + tag="ERROR" + ) + + return CrawlResult( + url=url, + html="", + success=False, + error_message=error_message + ) + + async def aprocess_html( + self, + url: str, + html: str, + extracted_content: str, + config: CrawlerRunConfig, + screenshot: str, + pdf_data: str, + verbose: bool, + **kwargs, + ) -> CrawlResult: + """ + Process HTML content using the provided configuration. 
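+        As implemented below, the pipeline is: scrape the raw HTML with
+        WebScrapingStrategy, generate markdown from the cleaned HTML with the
+        configured markdown generator (DefaultMarkdownGenerator when none is
+        set), and run structured extraction only when an extraction strategy is
+        configured and no previously extracted content was supplied.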
+ + Args: + url: The URL being processed + html: Raw HTML content + extracted_content: Previously extracted content (if any) + config: Configuration object controlling processing behavior + screenshot: Screenshot data (if any) + pdf_data: PDF data (if any) + verbose: Whether to enable verbose logging + **kwargs: Additional parameters for backwards compatibility + + Returns: + CrawlResult: Processed result containing extracted and formatted content + """ + try: + _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML" + t1 = time.perf_counter() + + # Initialize scraping strategy + scrapping_strategy = WebScrapingStrategy(logger=self.logger) + + # Process HTML content + params = {k:v for k, v in config.to_dict().items() if k not in ["url"]} + # add keys from kwargs to params that doesn't exist in params + params.update({k:v for k, v in kwargs.items() if k not in params.keys()}) + + result = scrapping_strategy.scrap( + url, + html, + **params, + # word_count_threshold=config.word_count_threshold, + # css_selector=config.css_selector, + # only_text=config.only_text, + # image_description_min_word_threshold=config.image_description_min_word_threshold, + # content_filter=config.content_filter, + # **kwargs + ) + + if result is None: + raise ValueError(f"Process HTML, Failed to extract content from the website: {url}") + + except InvalidCSSSelectorError as e: + raise ValueError(str(e)) + except Exception as e: + raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}") + + + + # Extract results + cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) + fit_markdown = sanitize_input_encode(result.get("fit_markdown", "")) + fit_html = sanitize_input_encode(result.get("fit_html", "")) + media = result.get("media", []) + links = result.get("links", []) + metadata = result.get("metadata", {}) + + # Markdown Generation + markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator() + + # Uncomment if by default we want to use PruningContentFilter + # if not config.content_filter and not markdown_generator.content_filter: + # markdown_generator.content_filter = PruningContentFilter() + + markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown( + cleaned_html=cleaned_html, + base_url=url, + # html2text_options=kwargs.get('html2text', {}) + ) + markdown_v2 = markdown_result + markdown = sanitize_input_encode(markdown_result.raw_markdown) + + # Log processing completion + self.logger.info( + message="Processed {url:.50}... | Time: {timing}ms", + tag="SCRAPE", + params={ + "url": _url, + "timing": int((time.perf_counter() - t1) * 1000) + } + ) + + # Handle content extraction if needed + if (extracted_content is None and + config.extraction_strategy and + config.chunking_strategy and + not isinstance(config.extraction_strategy, NoExtractionStrategy)): + + t1 = time.perf_counter() + + # Choose content based on input_format + content_format = config.extraction_strategy.input_format + if content_format == "fit_markdown" and not markdown_result.fit_markdown: + self.logger.warning( + message="Fit markdown requested but not available. 
Falling back to raw markdown.", + tag="EXTRACT", + params={"url": _url} + ) + content_format = "markdown" + + content = { + "markdown": markdown, + "html": html, + "fit_markdown": markdown_result.raw_markdown + }.get(content_format, markdown) + + # Use IdentityChunking for HTML input, otherwise use provided chunking strategy + chunking = IdentityChunking() if content_format == "html" else config.chunking_strategy + sections = chunking.chunk(content) + extracted_content = config.extraction_strategy.run(url, sections) + extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) + + # Log extraction completion + self.logger.info( + message="Completed for {url:.50}... | Time: {timing}s", + tag="EXTRACT", + params={ + "url": _url, + "timing": time.perf_counter() - t1 + } + ) + + # Handle screenshot and PDF data + screenshot_data = None if not screenshot else screenshot + pdf_data = None if not pdf_data else pdf_data + + # Apply HTML formatting if requested + if config.prettiify: + cleaned_html = fast_format_html(cleaned_html) + + # Return complete crawl result + return CrawlResult( + url=url, + html=html, + cleaned_html=cleaned_html, + markdown_v2=markdown_v2, + markdown=markdown, + fit_markdown=fit_markdown, + fit_html=fit_html, + media=media, + links=links, + metadata=metadata, + screenshot=screenshot_data, + pdf=pdf_data, + extracted_content=extracted_content, + success=True, + error_message="", + ) + + async def arun_many( + self, + urls: List[str], + config: Optional[CrawlerRunConfig] = None, + # Legacy parameters maintained for backwards compatibility + word_count_threshold=MIN_WORD_THRESHOLD, + extraction_strategy: ExtractionStrategy = None, + chunking_strategy: ChunkingStrategy = RegexChunking(), + content_filter: RelevantContentFilter = None, + cache_mode: Optional[CacheMode] = None, + bypass_cache: bool = False, + css_selector: str = None, + screenshot: bool = False, + pdf: bool = False, + user_agent: str = None, + verbose=True, + **kwargs, + ) -> List[CrawlResult]: + """ + Runs the crawler for multiple URLs concurrently. + + Migration Guide: + Old way (deprecated): + results = await crawler.arun_many( + urls, + word_count_threshold=200, + screenshot=True, + ... + ) + + New way (recommended): + config = CrawlerRunConfig( + word_count_threshold=200, + screenshot=True, + ... + ) + results = await crawler.arun_many(urls, crawler_config=config) + + Args: + urls: List of URLs to crawl + crawler_config: Configuration object controlling crawl behavior for all URLs + [other parameters maintained for backwards compatibility] + + Returns: + List[CrawlResult]: Results for each URL + """ + crawler_config = config + # Handle configuration + if crawler_config is not None: + if any(param is not None for param in [ + word_count_threshold, extraction_strategy, chunking_strategy, + content_filter, cache_mode, css_selector, screenshot, pdf + ]): + self.logger.warning( + message="Both crawler_config and legacy parameters provided. 
crawler_config will take precedence.", + tag="WARNING" + ) + config = crawler_config + else: + # Merge all parameters into a single kwargs dict for config creation + config_kwargs = { + "word_count_threshold": word_count_threshold, + "extraction_strategy": extraction_strategy, + "chunking_strategy": chunking_strategy, + "content_filter": content_filter, + "cache_mode": cache_mode, + "bypass_cache": bypass_cache, + "css_selector": css_selector, + "screenshot": screenshot, + "pdf": pdf, + "verbose": verbose, + **kwargs + } + config = CrawlerRunConfig.from_kwargs(config_kwargs) + + if bypass_cache: + if kwargs.get("warning", True): + warnings.warn( + "'bypass_cache' is deprecated and will be removed in version 0.5.0. " + "Use 'cache_mode=CacheMode.BYPASS' instead. " + "Pass warning=False to suppress this warning.", + DeprecationWarning, + stacklevel=2 + ) + if config.cache_mode is None: + config.cache_mode = CacheMode.BYPASS + + semaphore_count = config.semaphore_count or 5 + semaphore = asyncio.Semaphore(semaphore_count) + + async def crawl_with_semaphore(url): + # Handle rate limiting per domain + domain = urlparse(url).netloc + current_time = time.time() + + self.logger.debug( + message="Started task for {url:.50}...", + tag="PARALLEL", + params={"url": url} + ) + + # Get delay settings from config + mean_delay = config.mean_delay + max_range = config.max_range + + # Apply rate limiting + if domain in self._domain_last_hit: + time_since_last = current_time - self._domain_last_hit[domain] + if time_since_last < mean_delay: + delay = mean_delay + random.uniform(0, max_range) + await asyncio.sleep(delay) + + self._domain_last_hit[domain] = current_time + + async with semaphore: + return await self.arun( + url, + crawler_config=config, # Pass the entire config object + user_agent=user_agent # Maintain user_agent override capability + ) + + # Log start of concurrent crawling + self.logger.info( + message="Starting concurrent crawling for {count} URLs...", + tag="INIT", + params={"count": len(urls)} + ) + + # Execute concurrent crawls + start_time = time.perf_counter() + tasks = [crawl_with_semaphore(url) for url in urls] + results = await asyncio.gather(*tasks, return_exceptions=True) + end_time = time.perf_counter() + + # Log completion + self.logger.success( + message="Concurrent crawling completed for {count} URLs | Total time: {timing}", + tag="COMPLETE", + params={ + "count": len(urls), + "timing": f"{end_time - start_time:.2f}s" + }, + colors={ + "timing": Fore.YELLOW + } + ) + + return [result if not isinstance(result, Exception) else str(result) for result in results] + + async def aclear_cache(self): + """Clear the cache database.""" + await async_db_manager.cleanup() + + async def aflush_cache(self): + """Flush the cache database.""" + await async_db_manager.aflush_db() + + async def aget_cache_size(self): + """Get the total number of cached items.""" + return await async_db_manager.aget_total_count() diff --git a/crawl4ai/cache_context.py b/crawl4ai/cache_context.py new file mode 100644 index 0000000000000000000000000000000000000000..588edd6249a8b4975059a679bdd57b7fe1c0ee49 --- /dev/null +++ b/crawl4ai/cache_context.py @@ -0,0 +1,115 @@ +from enum import Enum + + +class CacheMode(Enum): + """ + Defines the caching behavior for web crawling operations. 
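+    Typical usage (illustrative sketch; import paths assumed from this
+    package layout):
+
+        from crawl4ai.async_webcrawler import AsyncWebCrawler
+        from crawl4ai.async_configs import CrawlerRunConfig
+        from crawl4ai.cache_context import CacheMode
+
+        async with AsyncWebCrawler() as crawler:
+            # BYPASS skips both cache reads and writes for this one call
+            result = await crawler.arun(
+                url="https://example.com",
+                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
+            )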
+ + Modes: + - ENABLED: Normal caching behavior (read and write) + - DISABLED: No caching at all + - READ_ONLY: Only read from cache, don't write + - WRITE_ONLY: Only write to cache, don't read + - BYPASS: Bypass cache for this operation + """ + ENABLED = "enabled" + DISABLED = "disabled" + READ_ONLY = "read_only" + WRITE_ONLY = "write_only" + BYPASS = "bypass" + + +class CacheContext: + """ + Encapsulates cache-related decisions and URL handling. + + This class centralizes all cache-related logic and URL type checking, + making the caching behavior more predictable and maintainable. + + Attributes: + url (str): The URL being processed. + cache_mode (CacheMode): The cache mode for the current operation. + always_bypass (bool): If True, bypasses caching for this operation. + is_cacheable (bool): True if the URL is cacheable, False otherwise. + is_web_url (bool): True if the URL is a web URL, False otherwise. + is_local_file (bool): True if the URL is a local file, False otherwise. + is_raw_html (bool): True if the URL is raw HTML, False otherwise. + _url_display (str): The display name for the URL (web, local file, or raw HTML). + """ + def __init__(self, url: str, cache_mode: CacheMode, always_bypass: bool = False): + """ + Initializes the CacheContext with the provided URL and cache mode. + + Args: + url (str): The URL being processed. + cache_mode (CacheMode): The cache mode for the current operation. + always_bypass (bool): If True, bypasses caching for this operation. + """ + self.url = url + self.cache_mode = cache_mode + self.always_bypass = always_bypass + self.is_cacheable = url.startswith(('http://', 'https://', 'file://')) + self.is_web_url = url.startswith(('http://', 'https://')) + self.is_local_file = url.startswith("file://") + self.is_raw_html = url.startswith("raw:") + self._url_display = url if not self.is_raw_html else "Raw HTML" + + def should_read(self) -> bool: + """ + Determines if cache should be read based on context. + + How it works: + 1. If always_bypass is True or is_cacheable is False, return False. + 2. If cache_mode is ENABLED or READ_ONLY, return True. + + Returns: + bool: True if cache should be read, False otherwise. + """ + if self.always_bypass or not self.is_cacheable: + return False + return self.cache_mode in [CacheMode.ENABLED, CacheMode.READ_ONLY] + + def should_write(self) -> bool: + """ + Determines if cache should be written based on context. + + How it works: + 1. If always_bypass is True or is_cacheable is False, return False. + 2. If cache_mode is ENABLED or WRITE_ONLY, return True. + + Returns: + bool: True if cache should be written, False otherwise. + """ + if self.always_bypass or not self.is_cacheable: + return False + return self.cache_mode in [CacheMode.ENABLED, CacheMode.WRITE_ONLY] + + @property + def display_url(self) -> str: + """Returns the URL in display format.""" + return self._url_display + + +def _legacy_to_cache_mode( + disable_cache: bool = False, + bypass_cache: bool = False, + no_cache_read: bool = False, + no_cache_write: bool = False +) -> CacheMode: + """ + Converts legacy cache parameters to the new CacheMode enum. + + This is an internal function to help transition from the old boolean flags + to the new CacheMode system. 
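+    For reference, the mapping implemented below (checked in this order):
+
+        disable_cache=True                         -> CacheMode.DISABLED
+        bypass_cache=True                          -> CacheMode.BYPASS
+        no_cache_read=True and no_cache_write=True -> CacheMode.DISABLED
+        no_cache_read=True                         -> CacheMode.WRITE_ONLY
+        no_cache_write=True                        -> CacheMode.READ_ONLY
+        (no flags set)                             -> CacheMode.ENABLED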
+ """ + if disable_cache: + return CacheMode.DISABLED + if bypass_cache: + return CacheMode.BYPASS + if no_cache_read and no_cache_write: + return CacheMode.DISABLED + if no_cache_read: + return CacheMode.WRITE_ONLY + if no_cache_write: + return CacheMode.READ_ONLY + return CacheMode.ENABLED diff --git a/crawl4ai/chunking_strategy.py b/crawl4ai/chunking_strategy.py new file mode 100644 index 0000000000000000000000000000000000000000..7b8c08adbcda7b3e95c0048f3538f60ef351c88d --- /dev/null +++ b/crawl4ai/chunking_strategy.py @@ -0,0 +1,231 @@ +from abc import ABC, abstractmethod +import re +from collections import Counter +import string +from .model_loader import load_nltk_punkt +from .utils import * + +# Define the abstract base class for chunking strategies +class ChunkingStrategy(ABC): + """ + Abstract base class for chunking strategies. + """ + + @abstractmethod + def chunk(self, text: str) -> list: + """ + Abstract method to chunk the given text. + + Args: + text (str): The text to chunk. + + Returns: + list: A list of chunks. + """ + pass + +# Create an identity chunking strategy f(x) = [x] +class IdentityChunking(ChunkingStrategy): + """ + Chunking strategy that returns the input text as a single chunk. + """ + def chunk(self, text: str) -> list: + return [text] + +# Regex-based chunking +class RegexChunking(ChunkingStrategy): + """ + Chunking strategy that splits text based on regular expression patterns. + """ + def __init__(self, patterns=None, **kwargs): + """ + Initialize the RegexChunking object. + + Args: + patterns (list): A list of regular expression patterns to split text. + """ + if patterns is None: + patterns = [r'\n\n'] # Default split pattern + self.patterns = patterns + + def chunk(self, text: str) -> list: + paragraphs = [text] + for pattern in self.patterns: + new_paragraphs = [] + for paragraph in paragraphs: + new_paragraphs.extend(re.split(pattern, paragraph)) + paragraphs = new_paragraphs + return paragraphs + +# NLP-based sentence chunking +class NlpSentenceChunking(ChunkingStrategy): + """ + Chunking strategy that splits text into sentences using NLTK's sentence tokenizer. + """ + def __init__(self, **kwargs): + """ + Initialize the NlpSentenceChunking object. + """ + load_nltk_punkt() + + + def chunk(self, text: str) -> list: + # Improved regex for sentence splitting + # sentence_endings = re.compile( + # r'(? list: + # Use the TextTilingTokenizer to segment the text + segmented_topics = self.tokenizer.tokenize(text) + return segmented_topics + + def extract_keywords(self, text: str) -> list: + # Tokenize and remove stopwords and punctuation + import nltk as nl + tokens = nl.toknize.word_tokenize(text) + tokens = [token.lower() for token in tokens if token not in nl.corpus.stopwords.words('english') and token not in string.punctuation] + + # Calculate frequency distribution + freq_dist = Counter(tokens) + keywords = [word for word, freq in freq_dist.most_common(self.num_keywords)] + return keywords + + def chunk_with_topics(self, text: str) -> list: + # Segment the text into topics + segments = self.chunk(text) + # Extract keywords for each topic segment + segments_with_topics = [(segment, self.extract_keywords(segment)) for segment in segments] + return segments_with_topics + +# Fixed-length word chunks +class FixedLengthWordChunking(ChunkingStrategy): + """ + Chunking strategy that splits text into fixed-length word chunks. + + How it works: + 1. Split the text into words + 2. Create chunks of fixed length + 3. 
Return the list of chunks + """ + def __init__(self, chunk_size=100, **kwargs): + """ + Initialize the fixed-length word chunking strategy with the given chunk size. + + Args: + chunk_size (int): The size of each chunk in words. + """ + self.chunk_size = chunk_size + + def chunk(self, text: str) -> list: + words = text.split() + return [' '.join(words[i:i + self.chunk_size]) for i in range(0, len(words), self.chunk_size)] + +# Sliding window chunking +class SlidingWindowChunking(ChunkingStrategy): + """ + Chunking strategy that splits text into overlapping word chunks. + + How it works: + 1. Split the text into words + 2. Create chunks of fixed length + 3. Return the list of chunks + """ + def __init__(self, window_size=100, step=50, **kwargs): + """ + Initialize the sliding window chunking strategy with the given window size and + step size. + + Args: + window_size (int): The size of the sliding window in words. + step (int): The step size for sliding the window in words. + """ + self.window_size = window_size + self.step = step + + def chunk(self, text: str) -> list: + words = text.split() + chunks = [] + + if len(words) <= self.window_size: + return [text] + + for i in range(0, len(words) - self.window_size + 1, self.step): + chunk = ' '.join(words[i:i + self.window_size]) + chunks.append(chunk) + + # Handle the last chunk if it doesn't align perfectly + if i + self.window_size < len(words): + chunks.append(' '.join(words[-self.window_size:])) + + return chunks + +class OverlappingWindowChunking(ChunkingStrategy): + """ + Chunking strategy that splits text into overlapping word chunks. + + How it works: + 1. Split the text into words using whitespace + 2. Create chunks of fixed length equal to the window size + 3. Slide the window by the overlap size + 4. Return the list of chunks + """ + def __init__(self, window_size=1000, overlap=100, **kwargs): + """ + Initialize the overlapping window chunking strategy with the given window size and + overlap size. + + Args: + window_size (int): The size of the window in words. + overlap (int): The size of the overlap between consecutive chunks in words. 
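+            Example (illustrative): with window_size=5 and overlap=2, the text
+            "one two three four five six seven eight" becomes two chunks,
+            "one two three four five" and "four five six seven eight", because
+            each new window starts overlap words before the previous window's end.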
+ """ + self.window_size = window_size + self.overlap = overlap + + def chunk(self, text: str) -> list: + words = text.split() + chunks = [] + + if len(words) <= self.window_size: + return [text] + + start = 0 + while start < len(words): + end = start + self.window_size + chunk = ' '.join(words[start:end]) + chunks.append(chunk) + + if end >= len(words): + break + + start = end - self.overlap + + return chunks \ No newline at end of file diff --git a/crawl4ai/cli.py b/crawl4ai/cli.py new file mode 100644 index 0000000000000000000000000000000000000000..4a01c1c2a5bd6d293a2e0f99ef8d4fea41551b9b --- /dev/null +++ b/crawl4ai/cli.py @@ -0,0 +1,105 @@ +import click +import sys +import asyncio +from typing import List +from .docs_manager import DocsManager +from .async_logger import AsyncLogger + +logger = AsyncLogger(verbose=True) +docs_manager = DocsManager(logger) + +def print_table(headers: List[str], rows: List[List[str]], padding: int = 2): + """Print formatted table with headers and rows""" + widths = [max(len(str(cell)) for cell in col) for col in zip(headers, *rows)] + border = '+' + '+'.join('-' * (w + 2 * padding) for w in widths) + '+' + + def format_row(row): + return '|' + '|'.join(f"{' ' * padding}{str(cell):<{w}}{' ' * padding}" + for cell, w in zip(row, widths)) + '|' + + click.echo(border) + click.echo(format_row(headers)) + click.echo(border) + for row in rows: + click.echo(format_row(row)) + click.echo(border) + +@click.group() +def cli(): + """Crawl4AI Command Line Interface""" + pass + +@cli.group() +def docs(): + """Documentation operations""" + pass + +@docs.command() +@click.argument('sections', nargs=-1) +@click.option('--mode', type=click.Choice(['extended', 'condensed']), default='extended') +def combine(sections: tuple, mode: str): + """Combine documentation sections""" + try: + asyncio.run(docs_manager.ensure_docs_exist()) + click.echo(docs_manager.generate(sections, mode)) + except Exception as e: + logger.error(str(e), tag="ERROR") + sys.exit(1) + +@docs.command() +@click.argument('query') +@click.option('--top-k', '-k', default=5) +@click.option('--build-index', is_flag=True, help='Build index if missing') +def search(query: str, top_k: int, build_index: bool): + """Search documentation""" + try: + result = docs_manager.search(query, top_k) + if result == "No search index available. Call build_search_index() first.": + if build_index or click.confirm('No search index found. 
Build it now?'): + asyncio.run(docs_manager.llm_text.generate_index_files()) + result = docs_manager.search(query, top_k) + click.echo(result) + except Exception as e: + click.echo(f"Error: {str(e)}", err=True) + sys.exit(1) + +@docs.command() +def update(): + """Update docs from GitHub""" + try: + asyncio.run(docs_manager.fetch_docs()) + click.echo("Documentation updated successfully") + except Exception as e: + click.echo(f"Error: {str(e)}", err=True) + sys.exit(1) + +@docs.command() +@click.option('--force-facts', is_flag=True, help='Force regenerate fact files') +@click.option('--clear-cache', is_flag=True, help='Clear BM25 cache') +def index(force_facts: bool, clear_cache: bool): + """Build or rebuild search indexes""" + try: + asyncio.run(docs_manager.ensure_docs_exist()) + asyncio.run(docs_manager.llm_text.generate_index_files( + force_generate_facts=force_facts, + clear_bm25_cache=clear_cache + )) + click.echo("Search indexes built successfully") + except Exception as e: + click.echo(f"Error: {str(e)}", err=True) + sys.exit(1) + +# Add docs list command +@docs.command() +def list(): + """List available documentation sections""" + try: + sections = docs_manager.list() + print_table(["Sections"], [[section] for section in sections]) + + except Exception as e: + click.echo(f"Error: {str(e)}", err=True) + sys.exit(1) + +if __name__ == '__main__': + cli() \ No newline at end of file diff --git a/crawl4ai/config.py b/crawl4ai/config.py new file mode 100644 index 0000000000000000000000000000000000000000..c2be763810763dad7c62aea734b0def7bd4c21a2 --- /dev/null +++ b/crawl4ai/config.py @@ -0,0 +1,64 @@ +import os +from dotenv import load_dotenv + +load_dotenv() # Load environment variables from .env file + +# Default provider, ONLY used when the extraction strategy is LLMExtractionStrategy +DEFAULT_PROVIDER = "openai/gpt-4o-mini" +MODEL_REPO_BRANCH = "new-release-0.0.2" +# Provider-model dictionary, ONLY used when the extraction strategy is LLMExtractionStrategy +PROVIDER_MODELS = { + "ollama/llama3": "no-token-needed", # Any model from Ollama no need for API token + "groq/llama3-70b-8192": os.getenv("GROQ_API_KEY"), + "groq/llama3-8b-8192": os.getenv("GROQ_API_KEY"), + "openai/gpt-4o-mini": os.getenv("OPENAI_API_KEY"), + "openai/gpt-4o": os.getenv("OPENAI_API_KEY"), + "openai/o1-mini": os.getenv("OPENAI_API_KEY"), + "openai/o1-preview": os.getenv("OPENAI_API_KEY"), + "anthropic/claude-3-haiku-20240307": os.getenv("ANTHROPIC_API_KEY"), + "anthropic/claude-3-opus-20240229": os.getenv("ANTHROPIC_API_KEY"), + "anthropic/claude-3-sonnet-20240229": os.getenv("ANTHROPIC_API_KEY"), + "anthropic/claude-3-5-sonnet-20240620": os.getenv("ANTHROPIC_API_KEY"), +} + +# Chunk token threshold +CHUNK_TOKEN_THRESHOLD = 2 ** 11 # 2048 tokens +OVERLAP_RATE = 0.1 +WORD_TOKEN_RATE = 1.3 + +# Threshold for the minimum number of word in a HTML tag to be considered +MIN_WORD_THRESHOLD = 1 +IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD = 1 + +IMPORTANT_ATTRS = ['src', 'href', 'alt', 'title', 'width', 'height'] +ONLY_TEXT_ELIGIBLE_TAGS = ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark'] +SOCIAL_MEDIA_DOMAINS = [ + 'facebook.com', + 'twitter.com', + 'x.com', + 'linkedin.com', + 'instagram.com', + 'pinterest.com', + 'tiktok.com', + 'snapchat.com', + 'reddit.com', + ] + +# Threshold for the Image extraction - Range is 1 to 6 +# Images are scored based on point based system, to filter based on usefulness. 
Points are assigned +# to each image based on the following aspects. +# If either height or width exceeds 150px +# If image size is greater than 10Kb +# If alt property is set +# If image format is in jpg, png or webp +# If image is in the first half of the total images extracted from the page +IMAGE_SCORE_THRESHOLD = 2 + +MAX_METRICS_HISTORY = 1000 + +NEED_MIGRATION = True +URL_LOG_SHORTEN_LENGTH = 30 +SHOW_DEPRECATION_WARNINGS = True +SCREENSHOT_HEIGHT_TRESHOLD = 10000 +PAGE_TIMEOUT=60000 +DOWNLOAD_PAGE_TIMEOUT=60000 \ No newline at end of file diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py new file mode 100644 index 0000000000000000000000000000000000000000..ce4331187ef57477f605222f275fa63eaeccbe6c --- /dev/null +++ b/crawl4ai/content_filter_strategy.py @@ -0,0 +1,627 @@ +import re +from bs4 import BeautifulSoup, Tag +from typing import List, Tuple, Dict +from rank_bm25 import BM25Okapi +from time import perf_counter +from collections import deque +from bs4 import BeautifulSoup, NavigableString, Tag, Comment +from .utils import clean_tokens +from abc import ABC, abstractmethod +import math +from snowballstemmer import stemmer +class RelevantContentFilter(ABC): + """Abstract base class for content filtering strategies""" + def __init__(self, user_query: str = None): + self.user_query = user_query + self.included_tags = { + # Primary structure + 'article', 'main', 'section', 'div', + # List structures + 'ul', 'ol', 'li', 'dl', 'dt', 'dd', + # Text content + 'p', 'span', 'blockquote', 'pre', 'code', + # Headers + 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', + # Tables + 'table', 'thead', 'tbody', 'tr', 'td', 'th', + # Other semantic elements + 'figure', 'figcaption', 'details', 'summary', + # Text formatting + 'em', 'strong', 'b', 'i', 'mark', 'small', + # Rich content + 'time', 'address', 'cite', 'q' + } + self.excluded_tags = { + 'nav', 'footer', 'header', 'aside', 'script', + 'style', 'form', 'iframe', 'noscript' + } + self.header_tags = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'} + self.negative_patterns = re.compile( + r'nav|footer|header|sidebar|ads|comment|promo|advert|social|share', + re.I + ) + self.min_word_count = 2 + + @abstractmethod + def filter_content(self, html: str) -> List[str]: + """Abstract method to be implemented by specific filtering strategies""" + pass + + def extract_page_query(self, soup: BeautifulSoup, body: Tag) -> str: + """Common method to extract page metadata with fallbacks""" + if self.user_query: + return self.user_query + + query_parts = [] + + # Title + try: + title = soup.title.string + if title: + query_parts.append(title) + except Exception: + pass + + if soup.find('h1'): + query_parts.append(soup.find('h1').get_text()) + + # Meta tags + temp = "" + for meta_name in ['keywords', 'description']: + meta = soup.find('meta', attrs={'name': meta_name}) + if meta and meta.get('content'): + query_parts.append(meta['content']) + temp += meta['content'] + + # If still empty, grab first significant paragraph + if not temp: + # Find the first tag P thatits text contains more than 50 characters + for p in body.find_all('p'): + if len(p.get_text()) > 150: + query_parts.append(p.get_text()[:150]) + break + + return ' '.join(filter(None, query_parts)) + + def extract_text_chunks(self, body: Tag, min_word_threshold: int = None) -> List[Tuple[str, str]]: + """ + Extracts text chunks from a BeautifulSoup body element while preserving order. + Returns list of tuples (text, tag_name) for classification. 
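+        Note: as implemented below, each emitted chunk is a 4-tuple of
+        (chunk_index, text, tag_type, element), where tag_type is 'header' for
+        heading tags and 'content' otherwise, and element is the originating Tag.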
+ + Args: + body: BeautifulSoup Tag object representing the body element + + Returns: + List of (text, tag_name) tuples + """ + # Tags to ignore - inline elements that shouldn't break text flow + INLINE_TAGS = { + 'a', 'abbr', 'acronym', 'b', 'bdo', 'big', 'br', 'button', 'cite', 'code', + 'dfn', 'em', 'i', 'img', 'input', 'kbd', 'label', 'map', 'object', 'q', + 'samp', 'script', 'select', 'small', 'span', 'strong', 'sub', 'sup', + 'textarea', 'time', 'tt', 'var' + } + + # Tags that typically contain meaningful headers + HEADER_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header'} + + chunks = [] + current_text = [] + chunk_index = 0 + + def should_break_chunk(tag: Tag) -> bool: + """Determine if a tag should cause a break in the current text chunk""" + return ( + tag.name not in INLINE_TAGS + and not (tag.name == 'p' and len(current_text) == 0) + ) + + # Use deque for efficient push/pop operations + stack = deque([(body, False)]) + + while stack: + element, visited = stack.pop() + + if visited: + # End of block element - flush accumulated text + if current_text and should_break_chunk(element): + text = ' '.join(''.join(current_text).split()) + if text: + tag_type = 'header' if element.name in HEADER_TAGS else 'content' + chunks.append((chunk_index, text, tag_type, element)) + chunk_index += 1 + current_text = [] + continue + + if isinstance(element, NavigableString): + if str(element).strip(): + current_text.append(str(element).strip()) + continue + + # Pre-allocate children to avoid multiple list operations + children = list(element.children) + if not children: + continue + + # Mark block for revisit after processing children + stack.append((element, True)) + + # Add children in reverse order for correct processing + for child in reversed(children): + if isinstance(child, (Tag, NavigableString)): + stack.append((child, False)) + + # Handle any remaining text + if current_text: + text = ' '.join(''.join(current_text).split()) + if text: + chunks.append((chunk_index, text, 'content', body)) + + if min_word_threshold: + chunks = [chunk for chunk in chunks if len(chunk[1].split()) >= min_word_threshold] + + return chunks + + def _deprecated_extract_text_chunks(self, soup: BeautifulSoup) -> List[Tuple[int, str, Tag]]: + """Common method for extracting text chunks""" + _text_cache = {} + def fast_text(element: Tag) -> str: + elem_id = id(element) + if elem_id in _text_cache: + return _text_cache[elem_id] + texts = [] + for content in element.contents: + if isinstance(content, str): + text = content.strip() + if text: + texts.append(text) + result = ' '.join(texts) + _text_cache[elem_id] = result + return result + + candidates = [] + index = 0 + + def dfs(element): + nonlocal index + if isinstance(element, Tag): + if element.name in self.included_tags: + if not self.is_excluded(element): + text = fast_text(element) + word_count = len(text.split()) + + # Headers pass through with adjusted minimum + if element.name in self.header_tags: + if word_count >= 3: # Minimal sanity check for headers + candidates.append((index, text, element)) + index += 1 + # Regular content uses standard minimum + elif word_count >= self.min_word_count: + candidates.append((index, text, element)) + index += 1 + + for child in element.children: + dfs(child) + + dfs(soup.body if soup.body else soup) + return candidates + + def is_excluded(self, tag: Tag) -> bool: + """Common method for exclusion logic""" + if tag.name in self.excluded_tags: + return True + class_id = ' '.join(filter(None, [ + ' '.join(tag.get('class', 
[])), + tag.get('id', '') + ])) + return bool(self.negative_patterns.search(class_id)) + + def clean_element(self, tag: Tag) -> str: + """Common method for cleaning HTML elements with minimal overhead""" + if not tag or not isinstance(tag, Tag): + return "" + + unwanted_tags = {'script', 'style', 'aside', 'form', 'iframe', 'noscript'} + unwanted_attrs = {'style', 'onclick', 'onmouseover', 'align', 'bgcolor', 'class', 'id'} + + # Use string builder pattern for better performance + builder = [] + + def render_tag(elem): + if not isinstance(elem, Tag): + if isinstance(elem, str): + builder.append(elem.strip()) + return + + if elem.name in unwanted_tags: + return + + # Start tag + builder.append(f'<{elem.name}') + + # Add cleaned attributes + attrs = {k: v for k, v in elem.attrs.items() if k not in unwanted_attrs} + for key, value in attrs.items(): + builder.append(f' {key}="{value}"') + + builder.append('>') + + # Process children + for child in elem.children: + render_tag(child) + + # Close tag + builder.append(f'') + + try: + render_tag(tag) + return ''.join(builder) + except Exception: + return str(tag) # Fallback to original if anything fails + +class BM25ContentFilter(RelevantContentFilter): + """ + Content filtering using BM25 algorithm with priority tag handling. + + How it works: + 1. Extracts page metadata with fallbacks. + 2. Extracts text chunks from the body element. + 3. Tokenizes the corpus and query. + 4. Applies BM25 algorithm to calculate scores for each chunk. + 5. Filters out chunks below the threshold. + 6. Sorts chunks by score in descending order. + 7. Returns the top N chunks. + + Attributes: + user_query (str): User query for filtering (optional). + bm25_threshold (float): BM25 threshold for filtering (default: 1.0). + language (str): Language for stemming (default: 'english'). + + Methods: + filter_content(self, html: str, min_word_threshold: int = None) + """ + def __init__(self, user_query: str = None, bm25_threshold: float = 1.0, language: str = 'english'): + """ + Initializes the BM25ContentFilter class, if not provided, falls back to page metadata. + + Note: + If no query is given and no page metadata is available, then it tries to pick up the first significant paragraph. + + Args: + user_query (str): User query for filtering (optional). + bm25_threshold (float): BM25 threshold for filtering (default: 1.0). + language (str): Language for stemming (default: 'english'). + """ + super().__init__(user_query=user_query) + self.bm25_threshold = bm25_threshold + self.priority_tags = { + 'h1': 5.0, + 'h2': 4.0, + 'h3': 3.0, + 'title': 4.0, + 'strong': 2.0, + 'b': 1.5, + 'em': 1.5, + 'blockquote': 2.0, + 'code': 2.0, + 'pre': 1.5, + 'th': 1.5, # Table headers + } + self.stemmer = stemmer(language) + + def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]: + """ + Implements content filtering using BM25 algorithm with priority tag handling. + + Note: + This method implements the filtering logic for the BM25ContentFilter class. + It takes HTML content as input and returns a list of filtered text chunks. + + Args: + html (str): HTML content to be filtered. + min_word_threshold (int): Minimum word threshold for filtering (optional). + + Returns: + List[str]: List of filtered text chunks. 
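+        Usage (illustrative sketch; import path assumed from this package layout):
+
+            from crawl4ai.content_filter_strategy import BM25ContentFilter
+
+            bm25_filter = BM25ContentFilter(
+                user_query="async web crawling",
+                bm25_threshold=1.2,
+            )
+            fragments = bm25_filter.filter_content(html)
+            # fragments is a list of cleaned HTML strings, kept in original
+            # document order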
+ """ + if not html or not isinstance(html, str): + return [] + + soup = BeautifulSoup(html, 'lxml') + + # Check if body is present + if not soup.body: + # Wrap in body tag if missing + soup = BeautifulSoup(f'{html}', 'lxml') + body = soup.find('body') + + query = self.extract_page_query(soup, body) + + if not query: + return [] + # return [self.clean_element(soup)] + + candidates = self.extract_text_chunks(body, min_word_threshold) + + if not candidates: + return [] + + # Tokenize corpus + # tokenized_corpus = [chunk.lower().split() for _, chunk, _, _ in candidates] + # tokenized_query = query.lower().split() + + # tokenized_corpus = [[ps.stem(word) for word in chunk.lower().split()] + # for _, chunk, _, _ in candidates] + # tokenized_query = [ps.stem(word) for word in query.lower().split()] + + tokenized_corpus = [[self.stemmer.stemWord(word) for word in chunk.lower().split()] + for _, chunk, _, _ in candidates] + tokenized_query = [self.stemmer.stemWord(word) for word in query.lower().split()] + + # tokenized_corpus = [[self.stemmer.stemWord(word) for word in tokenize_text(chunk.lower())] + # for _, chunk, _, _ in candidates] + # tokenized_query = [self.stemmer.stemWord(word) for word in tokenize_text(query.lower())] + + # Clean from stop words and noise + tokenized_corpus = [clean_tokens(tokens) for tokens in tokenized_corpus] + tokenized_query = clean_tokens(tokenized_query) + + bm25 = BM25Okapi(tokenized_corpus) + scores = bm25.get_scores(tokenized_query) + + # Adjust scores with tag weights + adjusted_candidates = [] + for score, (index, chunk, tag_type, tag) in zip(scores, candidates): + tag_weight = self.priority_tags.get(tag.name, 1.0) + adjusted_score = score * tag_weight + adjusted_candidates.append((adjusted_score, index, chunk, tag)) + + # Filter candidates by threshold + selected_candidates = [ + (index, chunk, tag) for adjusted_score, index, chunk, tag in adjusted_candidates + if adjusted_score >= self.bm25_threshold + ] + + if not selected_candidates: + return [] + + # Sort selected candidates by original document order + selected_candidates.sort(key=lambda x: x[0]) + + return [self.clean_element(tag) for _, _, tag in selected_candidates] + +class PruningContentFilter(RelevantContentFilter): + """ + Content filtering using pruning algorithm with dynamic threshold. + + How it works: + 1. Extracts page metadata with fallbacks. + 2. Extracts text chunks from the body element. + 3. Applies pruning algorithm to calculate scores for each chunk. + 4. Filters out chunks below the threshold. + 5. Sorts chunks by score in descending order. + 6. Returns the top N chunks. + + Attributes: + user_query (str): User query for filtering (optional), if not provided, falls back to page metadata. + min_word_threshold (int): Minimum word threshold for filtering (optional). + threshold_type (str): Threshold type for dynamic threshold (default: 'fixed'). + threshold (float): Fixed threshold value (default: 0.48). + + Methods: + filter_content(self, html: str, min_word_threshold: int = None): + """ + def __init__(self, user_query: str = None, min_word_threshold: int = None, + threshold_type: str = 'fixed', threshold: float = 0.48): + """ + Initializes the PruningContentFilter class, if not provided, falls back to page metadata. + + Note: + If no query is given and no page metadata is available, then it tries to pick up the first significant paragraph. + + Args: + user_query (str): User query for filtering (optional). + min_word_threshold (int): Minimum word threshold for filtering (optional). 
+ threshold_type (str): Threshold type for dynamic threshold (default: 'fixed'). + threshold (float): Fixed threshold value (default: 0.48). + """ + super().__init__(None) + self.min_word_threshold = min_word_threshold + self.threshold_type = threshold_type + self.threshold = threshold + + # Add tag importance for dynamic threshold + self.tag_importance = { + 'article': 1.5, + 'main': 1.4, + 'section': 1.3, + 'p': 1.2, + 'h1': 1.4, + 'h2': 1.3, + 'h3': 1.2, + 'div': 0.7, + 'span': 0.6 + } + + # Metric configuration + self.metric_config = { + 'text_density': True, + 'link_density': True, + 'tag_weight': True, + 'class_id_weight': True, + 'text_length': True, + } + + self.metric_weights = { + 'text_density': 0.4, + 'link_density': 0.2, + 'tag_weight': 0.2, + 'class_id_weight': 0.1, + 'text_length': 0.1, + } + + self.tag_weights = { + 'div': 0.5, + 'p': 1.0, + 'article': 1.5, + 'section': 1.0, + 'span': 0.3, + 'li': 0.5, + 'ul': 0.5, + 'ol': 0.5, + 'h1': 1.2, + 'h2': 1.1, + 'h3': 1.0, + 'h4': 0.9, + 'h5': 0.8, + 'h6': 0.7, + } + + def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]: + """ + Implements content filtering using pruning algorithm with dynamic threshold. + + Note: + This method implements the filtering logic for the PruningContentFilter class. + It takes HTML content as input and returns a list of filtered text chunks. + + Args: + html (str): HTML content to be filtered. + min_word_threshold (int): Minimum word threshold for filtering (optional). + + Returns: + List[str]: List of filtered text chunks. + """ + if not html or not isinstance(html, str): + return [] + + soup = BeautifulSoup(html, 'lxml') + if not soup.body: + soup = BeautifulSoup(f'{html}', 'lxml') + + # Remove comments and unwanted tags + self._remove_comments(soup) + self._remove_unwanted_tags(soup) + + # Prune tree starting from body + body = soup.find('body') + self._prune_tree(body) + + # Extract remaining content as list of HTML strings + content_blocks = [] + for element in body.children: + if isinstance(element, str) or not hasattr(element, 'name'): + continue + if len(element.get_text(strip=True)) > 0: + content_blocks.append(str(element)) + + return content_blocks + + def _remove_comments(self, soup): + """Removes HTML comments""" + for element in soup(text=lambda text: isinstance(text, Comment)): + element.extract() + + def _remove_unwanted_tags(self, soup): + """Removes unwanted tags""" + for tag in self.excluded_tags: + for element in soup.find_all(tag): + element.decompose() + + def _prune_tree(self, node): + """ + Prunes the tree starting from the given node. + + Args: + node (Tag): The node from which the pruning starts. 
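+        Example (illustrative): with the default base threshold of 0.48 and
+        threshold_type='dynamic', an article node (tag importance 1.5) whose
+        text-to-markup ratio is above 0.4 and whose link ratio is at most 0.6 is
+        kept once its composite score reaches roughly 0.48 * 0.8 * 0.9 = 0.3456,
+        whereas a low-text-density, link-heavy div (link ratio above 0.6) must
+        reach about 0.48 * 1.2 = 0.576.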
+ """ + if not node or not hasattr(node, 'name') or node.name is None: + return + + text_len = len(node.get_text(strip=True)) + tag_len = len(node.encode_contents().decode('utf-8')) + link_text_len = sum(len(s.strip()) for s in (a.string for a in node.find_all('a', recursive=False)) if s) + + metrics = { + 'node': node, + 'tag_name': node.name, + 'text_len': text_len, + 'tag_len': tag_len, + 'link_text_len': link_text_len + } + + score = self._compute_composite_score(metrics, text_len, tag_len, link_text_len) + + if self.threshold_type == 'fixed': + should_remove = score < self.threshold + else: # dynamic + tag_importance = self.tag_importance.get(node.name, 0.7) + text_ratio = text_len / tag_len if tag_len > 0 else 0 + link_ratio = link_text_len / text_len if text_len > 0 else 1 + + threshold = self.threshold # base threshold + if tag_importance > 1: + threshold *= 0.8 + if text_ratio > 0.4: + threshold *= 0.9 + if link_ratio > 0.6: + threshold *= 1.2 + + should_remove = score < threshold + + if should_remove: + node.decompose() + else: + children = [child for child in node.children if hasattr(child, 'name')] + for child in children: + self._prune_tree(child) + + def _compute_composite_score(self, metrics, text_len, tag_len, link_text_len): + """Computes the composite score""" + if self.min_word_threshold: + # Get raw text from metrics node - avoid extra processing + text = metrics['node'].get_text(strip=True) + word_count = text.count(' ') + 1 + if word_count < self.min_word_threshold: + return -1.0 # Guaranteed removal + score = 0.0 + total_weight = 0.0 + + if self.metric_config['text_density']: + density = text_len / tag_len if tag_len > 0 else 0 + score += self.metric_weights['text_density'] * density + total_weight += self.metric_weights['text_density'] + + if self.metric_config['link_density']: + density = 1 - (link_text_len / text_len if text_len > 0 else 0) + score += self.metric_weights['link_density'] * density + total_weight += self.metric_weights['link_density'] + + if self.metric_config['tag_weight']: + tag_score = self.tag_weights.get(metrics['tag_name'], 0.5) + score += self.metric_weights['tag_weight'] * tag_score + total_weight += self.metric_weights['tag_weight'] + + if self.metric_config['class_id_weight']: + class_score = self._compute_class_id_weight(metrics['node']) + score += self.metric_weights['class_id_weight'] * max(0, class_score) + total_weight += self.metric_weights['class_id_weight'] + + if self.metric_config['text_length']: + score += self.metric_weights['text_length'] * math.log(text_len + 1) + total_weight += self.metric_weights['text_length'] + + return score / total_weight if total_weight > 0 else 0 + + def _compute_class_id_weight(self, node): + """Computes the class ID weight""" + class_id_score = 0 + if 'class' in node.attrs: + classes = ' '.join(node['class']) + if self.negative_patterns.match(classes): + class_id_score -= 0.5 + if 'id' in node.attrs: + element_id = node['id'] + if self.negative_patterns.match(element_id): + class_id_score -= 0.5 + return class_id_score \ No newline at end of file diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py new file mode 100644 index 0000000000000000000000000000000000000000..f3a96cf3ee4bbb2c0ffe3622cd8ba5e2a2aaf5c1 --- /dev/null +++ b/crawl4ai/content_scraping_strategy.py @@ -0,0 +1,723 @@ +import re # Point 1: Pre-Compile Regular Expressions +import time +from abc import ABC, abstractmethod +from typing import Dict, Any, Optional +from bs4 import BeautifulSoup +from 
concurrent.futures import ThreadPoolExecutor +import asyncio, requests, re, os +from .config import * +from bs4 import element, NavigableString, Comment +from bs4 import PageElement, Tag +from urllib.parse import urljoin +from requests.exceptions import InvalidSchema +# from .content_cleaning_strategy import ContentCleaningStrategy +from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter#, HeuristicContentFilter +from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator +from .models import MarkdownGenerationResult +from .utils import ( + extract_metadata, + normalize_url, + is_external_url, + get_base_domain, +) + + +# Pre-compile regular expressions for Open Graph and Twitter metadata +OG_REGEX = re.compile(r'^og:') +TWITTER_REGEX = re.compile(r'^twitter:') +DIMENSION_REGEX = re.compile(r"(\d+)(\D*)") + +# Function to parse image height/width value and units +def parse_dimension(dimension): + if dimension: + # match = re.match(r"(\d+)(\D*)", dimension) + match = DIMENSION_REGEX.match(dimension) + if match: + number = int(match.group(1)) + unit = match.group(2) or 'px' # Default unit is 'px' if not specified + return number, unit + return None, None + +# Fetch image file metadata to extract size and extension +def fetch_image_file_size(img, base_url): + #If src is relative path construct full URL, if not it may be CDN URL + img_url = urljoin(base_url,img.get('src')) + try: + response = requests.head(img_url) + if response.status_code == 200: + return response.headers.get('Content-Length',None) + else: + print(f"Failed to retrieve file size for {img_url}") + return None + except InvalidSchema as e: + return None + finally: + return + +class ContentScrapingStrategy(ABC): + @abstractmethod + def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: + pass + + @abstractmethod + async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: + pass + +class WebScrapingStrategy(ContentScrapingStrategy): + """ + Class for web content scraping. Perhaps the most important class. + + How it works: + 1. Extract content from HTML using BeautifulSoup. + 2. Clean the extracted content using a content cleaning strategy. + 3. Filter the cleaned content using a content filtering strategy. + 4. Generate markdown content from the filtered content. + 5. Return the markdown content. + """ + + def __init__(self, logger=None): + self.logger = logger + + def _log(self, level, message, tag="SCRAPE", **kwargs): + """Helper method to safely use logger.""" + if self.logger: + log_method = getattr(self.logger, level) + log_method(message=message, tag=tag, **kwargs) + + def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: + """ + Main entry point for content scraping. + + Args: + url (str): The URL of the page to scrape. + html (str): The HTML content of the page. + **kwargs: Additional keyword arguments. + + Returns: + Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys: + + - 'markdown': The generated markdown content, type is str, however soon will become MarkdownGenerationResult via 'markdown.raw_markdown'. + - 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'. + - 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'. 
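# Usage sketch for parse_dimension above: it splits a raw width/height attribute into
# a (number, unit) pair, defaulting the unit to 'px'. The import path assumes this
# module is installed as crawl4ai.content_scraping_strategy, as the diff suggests.
from crawl4ai.content_scraping_strategy import parse_dimension

print(parse_dimension("640px"))  # -> (640, 'px')
print(parse_dimension("480"))    # -> (480, 'px'), unit defaults to 'px'
print(parse_dimension(None))     # -> (None, None)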
+ - 'markdown_v2': The generated markdown content with relevant content filtered, this is temporary and will be removed soon and replaced with 'markdown' + """ + return self._scrap(url, html, is_async=False, **kwargs) + + async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: + """ + Main entry point for asynchronous content scraping. + + Args: + url (str): The URL of the page to scrape. + html (str): The HTML content of the page. + **kwargs: Additional keyword arguments. + + Returns: + Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys: + + - 'markdown': The generated markdown content, type is str, however soon will become MarkdownGenerationResult via 'markdown.raw_markdown'. + - 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'. + - 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'. + - 'markdown_v2': The generated markdown content with relevant content filtered, this is temporary and will be removed soon and replaced with 'markdown' + """ + return await asyncio.to_thread(self._scrap, url, html, **kwargs) + + def flatten_nested_elements(self, node): + """ + Flatten nested elements in a HTML tree. + + Args: + node (Tag): The root node of the HTML tree. + + Returns: + Tag: The flattened HTML tree. + """ + if isinstance(node, NavigableString): + return node + if len(node.contents) == 1 and isinstance(node.contents[0], Tag) and node.contents[0].name == node.name: + return self.flatten_nested_elements(node.contents[0]) + node.contents = [self.flatten_nested_elements(child) for child in node.contents] + return node + + def find_closest_parent_with_useful_text(self, tag, **kwargs): + """ + Find the closest parent with useful text. + + Args: + tag (Tag): The starting tag to search from. + **kwargs: Additional keyword arguments. + + Returns: + Tag: The closest parent with useful text, or None if not found. + """ + image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD) + current_tag = tag + while current_tag: + current_tag = current_tag.parent + # Get the text content of the parent tag + if current_tag: + text_content = current_tag.get_text(separator=' ',strip=True) + # Check if the text content has at least word_count_threshold + if len(text_content.split()) >= image_description_min_word_threshold: + return text_content + return None + + def remove_unwanted_attributes(self, element, important_attrs, keep_data_attributes=False): + """ + Remove unwanted attributes from an HTML element. + + Args: + element (Tag): The HTML element to remove attributes from. + important_attrs (list): List of important attributes to keep. + keep_data_attributes (bool): Whether to keep data attributes. + + Returns: + None + """ + attrs_to_remove = [] + for attr in element.attrs: + if attr not in important_attrs: + if keep_data_attributes: + if not attr.startswith('data-'): + attrs_to_remove.append(attr) + else: + attrs_to_remove.append(attr) + + for attr in attrs_to_remove: + del element[attr] + + def process_image(self, img, url, index, total_images, **kwargs): + """ + Process an image element. + + How it works: + 1. Check if the image has valid display and inside undesired html elements. + 2. Score an image for it's usefulness. + 3. Extract image file metadata to extract size and extension. + 4. 
Generate a dictionary with the processed image information. + 5. Return the processed image information. + + Args: + img (Tag): The image element to process. + url (str): The URL of the page containing the image. + index (int): The index of the image in the list of images. + total_images (int): The total number of images in the list. + **kwargs: Additional keyword arguments. + + Returns: + dict: A dictionary containing the processed image information. + """ + parse_srcset = lambda s: [{'url': u.strip().split()[0], 'width': u.strip().split()[-1].rstrip('w') + if ' ' in u else None} + for u in [f"http{p}" for p in s.split("http") if p]] + + # Constants for checks + classes_to_check = frozenset(['button', 'icon', 'logo']) + tags_to_check = frozenset(['button', 'input']) + image_formats = frozenset(['jpg', 'jpeg', 'png', 'webp', 'avif', 'gif']) + + # Pre-fetch commonly used attributes + style = img.get('style', '') + alt = img.get('alt', '') + src = img.get('src', '') + data_src = img.get('data-src', '') + srcset = img.get('srcset', '') + data_srcset = img.get('data-srcset', '') + width = img.get('width') + height = img.get('height') + parent = img.parent + parent_classes = parent.get('class', []) + + # Quick validation checks + if ('display:none' in style or + parent.name in tags_to_check or + any(c in cls for c in parent_classes for cls in classes_to_check) or + any(c in src for c in classes_to_check) or + any(c in alt for c in classes_to_check)): + return None + + # Quick score calculation + score = 0 + if width and width.isdigit(): + width_val = int(width) + score += 1 if width_val > 150 else 0 + if height and height.isdigit(): + height_val = int(height) + score += 1 if height_val > 150 else 0 + if alt: + score += 1 + score += index/total_images < 0.5 + + # image_format = '' + # if "data:image/" in src: + # image_format = src.split(',')[0].split(';')[0].split('/')[1].split(';')[0] + # else: + # image_format = os.path.splitext(src)[1].lower().strip('.').split('?')[0] + + # if image_format in ('jpg', 'png', 'webp', 'avif'): + # score += 1 + + + # Check for image format in all possible sources + def has_image_format(url): + return any(fmt in url.lower() for fmt in image_formats) + + # Score for having proper image sources + if any(has_image_format(url) for url in [src, data_src, srcset, data_srcset]): + score += 1 + if srcset or data_srcset: + score += 1 + if img.find_parent('picture'): + score += 1 + + # Detect format from any available source + detected_format = None + for url in [src, data_src, srcset, data_srcset]: + if url: + format_matches = [fmt for fmt in image_formats if fmt in url.lower()] + if format_matches: + detected_format = format_matches[0] + break + + if score <= kwargs.get('image_score_threshold', IMAGE_SCORE_THRESHOLD): + return None + + # Use set for deduplication + unique_urls = set() + image_variants = [] + + # Generate a unique group ID for this set of variants + group_id = index + + # Base image info template + image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD) + base_info = { + 'alt': alt, + 'desc': self.find_closest_parent_with_useful_text(img, **kwargs), + 'score': score, + 'type': 'image', + 'group_id': group_id, # Group ID for this set of variants + 'format': detected_format, + } + + # Inline function for adding variants + def add_variant(src, width=None): + if src and not src.startswith('data:') and src not in unique_urls: + unique_urls.add(src) + image_variants.append({**base_info, 
'src': src, 'width': width}) + + # Process all sources + add_variant(src) + add_variant(data_src) + + # Handle srcset and data-srcset in one pass + for attr in ('srcset', 'data-srcset'): + if value := img.get(attr): + for source in parse_srcset(value): + add_variant(source['url'], source['width']) + + # Quick picture element check + if picture := img.find_parent('picture'): + for source in picture.find_all('source'): + if srcset := source.get('srcset'): + for src in parse_srcset(srcset): + add_variant(src['url'], src['width']) + + # Framework-specific attributes in one pass + for attr, value in img.attrs.items(): + if attr.startswith('data-') and ('src' in attr or 'srcset' in attr) and 'http' in value: + add_variant(value) + + return image_variants if image_variants else None + + def process_element(self, url, element: PageElement, **kwargs) -> Dict[str, Any]: + """ + Process an HTML element. + + How it works: + 1. Check if the element is an image, video, or audio. + 2. Extract the element's attributes and content. + 3. Process the element based on its type. + 4. Return the processed element information. + + Args: + url (str): The URL of the page containing the element. + element (Tag): The HTML element to process. + **kwargs: Additional keyword arguments. + + Returns: + dict: A dictionary containing the processed element information. + """ + media = {'images': [], 'videos': [], 'audios': []} + internal_links_dict = {} + external_links_dict = {} + self._process_element( + url, + element, + media, + internal_links_dict, + external_links_dict, + **kwargs + ) + return { + 'media': media, + 'internal_links_dict': internal_links_dict, + 'external_links_dict': external_links_dict + } + + def _process_element(self, url, element: PageElement, media: Dict[str, Any], internal_links_dict: Dict[str, Any], external_links_dict: Dict[str, Any], **kwargs) -> bool: + """ + Process an HTML element. 
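# Sketch of the link bookkeeping that _process_element performs below: each href is
# normalized against the page URL, then bucketed as internal or external by comparing
# base domains. It relies on the helpers this module imports from crawl4ai.utils; the
# URLs are placeholders.
from crawl4ai.utils import get_base_domain, is_external_url, normalize_url

page_url = "https://example.com/blog/post-1"
base_domain = get_base_domain(page_url)

internal_links, external_links = {}, {}
for href in ["/about", "https://cdn.example.org/asset.js"]:
    normalized = normalize_url(href, page_url)
    link_data = {"href": normalized, "text": "", "title": "", "base_domain": base_domain}
    if is_external_url(normalized, base_domain):
        external_links.setdefault(normalized, link_data)   # deduplicated by normalized href
    else:
        internal_links.setdefault(normalized, link_data)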
+ """ + try: + if isinstance(element, NavigableString): + if isinstance(element, Comment): + element.extract() + return False + + # if element.name == 'img': + # process_image(element, url, 0, 1) + # return True + base_domain = kwargs.get("base_domain", get_base_domain(url)) + + if element.name in ['script', 'style', 'link', 'meta', 'noscript']: + element.decompose() + return False + + keep_element = False + + exclude_domains = kwargs.get('exclude_domains', []) + # exclude_social_media_domains = kwargs.get('exclude_social_media_domains', set(SOCIAL_MEDIA_DOMAINS)) + # exclude_social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('exclude_social_media_domains', []) + # exclude_social_media_domains = list(set(exclude_social_media_domains)) + + try: + if element.name == 'a' and element.get('href'): + href = element.get('href', '').strip() + if not href: # Skip empty hrefs + return False + + url_base = url.split('/')[2] + + # Normalize the URL + try: + normalized_href = normalize_url(href, url) + except ValueError as e: + # logging.warning(f"Invalid URL format: {href}, Error: {str(e)}") + return False + + link_data = { + 'href': normalized_href, + 'text': element.get_text().strip(), + 'title': element.get('title', '').strip(), + 'base_domain': base_domain + } + + is_external = is_external_url(normalized_href, base_domain) + + keep_element = True + + # Handle external link exclusions + if is_external: + link_base_domain = get_base_domain(normalized_href) + link_data['base_domain'] = link_base_domain + if kwargs.get('exclude_external_links', False): + element.decompose() + return False + # elif kwargs.get('exclude_social_media_links', False): + # if link_base_domain in exclude_social_media_domains: + # element.decompose() + # return False + # if any(domain in normalized_href.lower() for domain in exclude_social_media_domains): + # element.decompose() + # return False + elif exclude_domains: + if link_base_domain in exclude_domains: + element.decompose() + return False + # if any(domain in normalized_href.lower() for domain in kwargs.get('exclude_domains', [])): + # element.decompose() + # return False + + if is_external: + if normalized_href not in external_links_dict: + external_links_dict[normalized_href] = link_data + else: + if normalized_href not in internal_links_dict: + internal_links_dict[normalized_href] = link_data + + + except Exception as e: + raise Exception(f"Error processing links: {str(e)}") + + try: + if element.name == 'img': + potential_sources = ['src', 'data-src', 'srcset' 'data-lazy-src', 'data-original'] + src = element.get('src', '') + while not src and potential_sources: + src = element.get(potential_sources.pop(0), '') + if not src: + element.decompose() + return False + + # If it is srcset pick up the first image + if 'srcset' in element.attrs: + src = element.attrs['srcset'].split(',')[0].split(' ')[0] + + # If image src is internal, then skip + if not is_external_url(src, base_domain): + return True + + image_src_base_domain = get_base_domain(src) + + # Check flag if we should remove external images + if kwargs.get('exclude_external_images', False): + element.decompose() + return False + # src_url_base = src.split('/')[2] + # url_base = url.split('/')[2] + # if url_base not in src_url_base: + # element.decompose() + # return False + + # if kwargs.get('exclude_social_media_links', False): + # if image_src_base_domain in exclude_social_media_domains: + # element.decompose() + # return False + # src_url_base = src.split('/')[2] + # url_base = url.split('/')[2] + # if 
any(domain in src for domain in exclude_social_media_domains): + # element.decompose() + # return False + + # Handle exclude domains + if exclude_domains: + if image_src_base_domain in exclude_domains: + element.decompose() + return False + # if any(domain in src for domain in kwargs.get('exclude_domains', [])): + # element.decompose() + # return False + + return True # Always keep image elements + except Exception as e: + raise "Error processing images" + + + # Check if flag to remove all forms is set + if kwargs.get('remove_forms', False) and element.name == 'form': + element.decompose() + return False + + if element.name in ['video', 'audio']: + media[f"{element.name}s"].append({ + 'src': element.get('src'), + 'alt': element.get('alt'), + 'type': element.name, + 'description': self.find_closest_parent_with_useful_text(element, **kwargs) + }) + source_tags = element.find_all('source') + for source_tag in source_tags: + media[f"{element.name}s"].append({ + 'src': source_tag.get('src'), + 'alt': element.get('alt'), + 'type': element.name, + 'description': self.find_closest_parent_with_useful_text(element, **kwargs) + }) + return True # Always keep video and audio elements + + if element.name in ONLY_TEXT_ELIGIBLE_TAGS: + if kwargs.get('only_text', False): + element.replace_with(element.get_text()) + + try: + self.remove_unwanted_attributes(element, IMPORTANT_ATTRS, kwargs.get('keep_data_attributes', False)) + except Exception as e: + # print('Error removing unwanted attributes:', str(e)) + self._log('error', + message="Error removing unwanted attributes: {error}", + tag="SCRAPE", + params={"error": str(e)} + ) + # Process children + for child in list(element.children): + if isinstance(child, NavigableString) and not isinstance(child, Comment): + if len(child.strip()) > 0: + keep_element = True + else: + if self._process_element(url, child, media, internal_links_dict, external_links_dict, **kwargs): + keep_element = True + + + # Check word count + word_count_threshold = kwargs.get('word_count_threshold', MIN_WORD_THRESHOLD) + if not keep_element: + word_count = len(element.get_text(strip=True).split()) + keep_element = word_count >= word_count_threshold + + if not keep_element: + element.decompose() + + return keep_element + except Exception as e: + # print('Error processing element:', str(e)) + self._log('error', + message="Error processing element: {error}", + tag="SCRAPE", + params={"error": str(e)} + ) + return False + + def _scrap(self, url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs) -> Dict[str, Any]: + """ + Extract content from HTML using BeautifulSoup. + + Args: + url (str): The URL of the page to scrape. + html (str): The HTML content of the page to scrape. + word_count_threshold (int): The minimum word count threshold for content extraction. + css_selector (str): The CSS selector to use for content extraction. + **kwargs: Additional keyword arguments. + + Returns: + dict: A dictionary containing the extracted content. 
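# Usage sketch for WebScrapingStrategy.scrap / _scrap above: pass the page URL, its raw
# HTML, and the filtering options as keyword arguments, then read the cleaned HTML,
# links, media, and metadata from the returned dict. URL, HTML, and option values are
# placeholders.
from crawl4ai.content_scraping_strategy import WebScrapingStrategy

strategy = WebScrapingStrategy()
result = strategy.scrap(
    "https://example.com/article",
    "<html><body><article><p>Some article text goes here.</p></article></body></html>",
    word_count_threshold=5,
    excluded_tags=["nav", "footer"],            # tag-based removal
    excluded_selector=".ads, .cookie-banner",   # CSS-selector-based removal
    exclude_external_links=True,
)
print(result["success"], len(result["cleaned_html"]))
print(result["links"]["internal"], result["media"]["images"])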
+ """ + success = True + if not html: + return None + + parser_type = kwargs.get('parser', 'lxml') + soup = BeautifulSoup(html, parser_type) + body = soup.body + base_domain = get_base_domain(url) + + try: + meta = extract_metadata("", soup) + except Exception as e: + self._log('error', + message="Error extracting metadata: {error}", + tag="SCRAPE", + params={"error": str(e)} + ) + meta = {} + + # Handle tag-based removal first - faster than CSS selection + excluded_tags = set(kwargs.get('excluded_tags', []) or []) + if excluded_tags: + for element in body.find_all(lambda tag: tag.name in excluded_tags): + element.extract() + + # Handle CSS selector-based removal + excluded_selector = kwargs.get('excluded_selector', '') + if excluded_selector: + is_single_selector = ',' not in excluded_selector and ' ' not in excluded_selector + if is_single_selector: + while element := body.select_one(excluded_selector): + element.extract() + else: + for element in body.select(excluded_selector): + element.extract() + + if css_selector: + selected_elements = body.select(css_selector) + if not selected_elements: + return { + 'markdown': '', + 'cleaned_html': '', + 'success': True, + 'media': {'images': [], 'videos': [], 'audios': []}, + 'links': {'internal': [], 'external': []}, + 'metadata': {}, + 'message': f"No elements found for CSS selector: {css_selector}" + } + # raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}") + body = soup.new_tag('div') + for el in selected_elements: + body.append(el) + + kwargs['exclude_social_media_domains'] = set(kwargs.get('exclude_social_media_domains', []) + SOCIAL_MEDIA_DOMAINS) + kwargs['exclude_domains'] = set(kwargs.get('exclude_domains', [])) + if kwargs.get('exclude_social_media_links', False): + kwargs['exclude_domains'] = kwargs['exclude_domains'].union(kwargs['exclude_social_media_domains']) + + result_obj = self.process_element( + url, + body, + word_count_threshold = word_count_threshold, + base_domain=base_domain, + **kwargs + ) + + links = {'internal': [], 'external': []} + media = result_obj['media'] + internal_links_dict = result_obj['internal_links_dict'] + external_links_dict = result_obj['external_links_dict'] + + # Update the links dictionary with unique links + links['internal'] = list(internal_links_dict.values()) + links['external'] = list(external_links_dict.values()) + + # # Process images using ThreadPoolExecutor + imgs = body.find_all('img') + + media['images'] = [ + img for result in (self.process_image(img, url, i, len(imgs)) + for i, img in enumerate(imgs)) + if result is not None + for img in result + ] + + body = self.flatten_nested_elements(body) + base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)') + for img in imgs: + src = img.get('src', '') + if base64_pattern.match(src): + # Replace base64 data with empty string + img['src'] = base64_pattern.sub('', src) + + str_body = "" + try: + str_body = body.encode_contents().decode('utf-8') + except Exception as e: + # Reset body to the original HTML + success = False + body = BeautifulSoup(html, 'html.parser') + + # Create a new div with a special ID + error_div = body.new_tag('div', id='crawl4ai_error_message') + error_div.string = ''' + Crawl4AI Error: This page is not fully supported. + + Possible reasons: + 1. The page may have restrictions that prevent crawling. + 2. The page might not be fully loaded. 
+ + Suggestions: + - Try calling the crawl function with these parameters: + magic=True, + - Set headless=False to visualize what's happening on the page. + + If the issue persists, please check the page's structure and any potential anti-crawling measures. + ''' + + # Append the error div to the body + body.body.append(error_div) + str_body = body.encode_contents().decode('utf-8') + + print(f"[LOG] 😧 Error: After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.") + self._log('error', + message="After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.", + tag="SCRAPE" + ) + + cleaned_html = str_body.replace('\n\n', '\n').replace(' ', ' ') + + + return { + # **markdown_content, + 'cleaned_html': cleaned_html, + 'success': success, + 'media': media, + 'links': links, + 'metadata': meta + } diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py new file mode 100644 index 0000000000000000000000000000000000000000..898dcfa8136f314abc49eb1c262b937ab5f605d2 --- /dev/null +++ b/crawl4ai/crawler_strategy.py @@ -0,0 +1,360 @@ +from abc import ABC, abstractmethod +from selenium import webdriver +from selenium.webdriver.chrome.service import Service +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.chrome.options import Options +from selenium.common.exceptions import InvalidArgumentException, WebDriverException +# from selenium.webdriver.chrome.service import Service as ChromeService +# from webdriver_manager.chrome import ChromeDriverManager +# from urllib3.exceptions import MaxRetryError + +from .config import * +import logging, time +import base64 +from PIL import Image, ImageDraw, ImageFont +from io import BytesIO +from typing import List, Callable +import requests +import os +from pathlib import Path +from .utils import * + +logger = logging.getLogger('selenium.webdriver.remote.remote_connection') +logger.setLevel(logging.WARNING) + +logger_driver = logging.getLogger('selenium.webdriver.common.service') +logger_driver.setLevel(logging.WARNING) + +urllib3_logger = logging.getLogger('urllib3.connectionpool') +urllib3_logger.setLevel(logging.WARNING) + +# Disable http.client logging +http_client_logger = logging.getLogger('http.client') +http_client_logger.setLevel(logging.WARNING) + +# Disable driver_finder and service logging +driver_finder_logger = logging.getLogger('selenium.webdriver.common.driver_finder') +driver_finder_logger.setLevel(logging.WARNING) + + + + +class CrawlerStrategy(ABC): + @abstractmethod + def crawl(self, url: str, **kwargs) -> str: + pass + + @abstractmethod + def take_screenshot(self, save_path: str): + pass + + @abstractmethod + def update_user_agent(self, user_agent: str): + pass + + @abstractmethod + def set_hook(self, hook_type: str, hook: Callable): + pass + +class CloudCrawlerStrategy(CrawlerStrategy): + def __init__(self, use_cached_html = False): + super().__init__() + self.use_cached_html = use_cached_html + + def crawl(self, url: str) -> str: + data = { + "urls": [url], + "include_raw_html": True, + "forced": True, + "extract_blocks": False, + } + + response = requests.post("http://crawl4ai.uccode.io/crawl", json=data) + response = response.json() + html = response["results"][0]["html"] + return sanitize_input_encode(html) + +class 
LocalSeleniumCrawlerStrategy(CrawlerStrategy): + def __init__(self, use_cached_html=False, js_code=None, **kwargs): + super().__init__() + print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy") + self.options = Options() + self.options.headless = True + if kwargs.get("proxy"): + self.options.add_argument("--proxy-server={}".format(kwargs.get("proxy"))) + if kwargs.get("user_agent"): + self.options.add_argument("--user-agent=" + kwargs.get("user_agent")) + else: + user_agent = kwargs.get("user_agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36") + self.options.add_argument(f"--user-agent={user_agent}") + self.options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36") + + self.options.headless = kwargs.get("headless", True) + if self.options.headless: + self.options.add_argument("--headless") + + self.options.add_argument("--disable-gpu") + self.options.add_argument("--window-size=1920,1080") + self.options.add_argument("--no-sandbox") + self.options.add_argument("--disable-dev-shm-usage") + self.options.add_argument("--disable-blink-features=AutomationControlled") + + # self.options.add_argument("--disable-dev-shm-usage") + self.options.add_argument("--disable-gpu") + # self.options.add_argument("--disable-extensions") + # self.options.add_argument("--disable-infobars") + # self.options.add_argument("--disable-logging") + # self.options.add_argument("--disable-popup-blocking") + # self.options.add_argument("--disable-translate") + # self.options.add_argument("--disable-default-apps") + # self.options.add_argument("--disable-background-networking") + # self.options.add_argument("--disable-sync") + # self.options.add_argument("--disable-features=NetworkService,NetworkServiceInProcess") + # self.options.add_argument("--disable-browser-side-navigation") + # self.options.add_argument("--dns-prefetch-disable") + # self.options.add_argument("--disable-web-security") + self.options.add_argument("--log-level=3") + self.use_cached_html = use_cached_html + self.use_cached_html = use_cached_html + self.js_code = js_code + self.verbose = kwargs.get("verbose", False) + + # Hooks + self.hooks = { + 'on_driver_created': None, + 'on_user_agent_updated': None, + 'before_get_url': None, + 'after_get_url': None, + 'before_return_html': None + } + + # chromedriver_autoinstaller.install() + # import chromedriver_autoinstaller + # crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") + # driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=self.options) + # chromedriver_path = chromedriver_autoinstaller.install() + # chromedriver_path = chromedriver_autoinstaller.utils.download_chromedriver() + # self.service = Service(chromedriver_autoinstaller.install()) + + + # chromedriver_path = ChromeDriverManager().install() + # self.service = Service(chromedriver_path) + # self.service.log_path = "NUL" + # self.driver = webdriver.Chrome(service=self.service, options=self.options) + + # Use selenium-manager (built into Selenium 4.10.0+) + self.service = Service() + self.driver = webdriver.Chrome(options=self.options) + + self.driver = self.execute_hook('on_driver_created', self.driver) + + if kwargs.get("cookies"): + for cookie in kwargs.get("cookies"): + self.driver.add_cookie(cookie) + + + + def set_hook(self, hook_type: str, hook: Callable): + if hook_type in self.hooks: + 
self.hooks[hook_type] = hook + else: + raise ValueError(f"Invalid hook type: {hook_type}") + + def execute_hook(self, hook_type: str, *args): + hook = self.hooks.get(hook_type) + if hook: + result = hook(*args) + if result is not None: + if isinstance(result, webdriver.Chrome): + return result + else: + raise TypeError(f"Hook {hook_type} must return an instance of webdriver.Chrome or None.") + # If the hook returns None or there is no hook, return self.driver + return self.driver + + def update_user_agent(self, user_agent: str): + self.options.add_argument(f"user-agent={user_agent}") + self.driver.quit() + self.driver = webdriver.Chrome(service=self.service, options=self.options) + self.driver = self.execute_hook('on_user_agent_updated', self.driver) + + def set_custom_headers(self, headers: dict): + # Enable Network domain for sending headers + self.driver.execute_cdp_cmd('Network.enable', {}) + # Set extra HTTP headers + self.driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': headers}) + + def _ensure_page_load(self, max_checks=6, check_interval=0.01): + initial_length = len(self.driver.page_source) + + for ix in range(max_checks): + # print(f"Checking page load: {ix}") + time.sleep(check_interval) + current_length = len(self.driver.page_source) + + if current_length != initial_length: + break + + return self.driver.page_source + + def crawl(self, url: str, **kwargs) -> str: + # Create md5 hash of the URL + import hashlib + url_hash = hashlib.md5(url.encode()).hexdigest() + + if self.use_cached_html: + cache_file_path = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", url_hash) + if os.path.exists(cache_file_path): + with open(cache_file_path, "r") as f: + return sanitize_input_encode(f.read()) + + try: + self.driver = self.execute_hook('before_get_url', self.driver) + if self.verbose: + print(f"[LOG] 🕸️ Crawling {url} using LocalSeleniumCrawlerStrategy...") + self.driver.get(url) # + + WebDriverWait(self.driver, 20).until( + lambda d: d.execute_script('return document.readyState') == 'complete' + ) + WebDriverWait(self.driver, 10).until( + EC.presence_of_all_elements_located((By.TAG_NAME, "body")) + ) + + self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") + + self.driver = self.execute_hook('after_get_url', self.driver) + html = sanitize_input_encode(self._ensure_page_load()) # self.driver.page_source + can_not_be_done_headless = False # Look at my creativity for naming variables + + # TODO: Very ugly approach, but promise to change it! + if kwargs.get('bypass_headless', False) or html == "": + print("[LOG] 🙌 Page could not be loaded in headless mode. 
Trying non-headless mode...") + can_not_be_done_headless = True + options = Options() + options.headless = False + # set window size very small + options.add_argument("--window-size=5,5") + driver = webdriver.Chrome(service=self.service, options=options) + driver.get(url) + self.driver = self.execute_hook('after_get_url', driver) + html = sanitize_input_encode(driver.page_source) + driver.quit() + + # Execute JS code if provided + self.js_code = kwargs.get("js_code", self.js_code) + if self.js_code and type(self.js_code) == str: + self.driver.execute_script(self.js_code) + # Optionally, wait for some condition after executing the JS code + WebDriverWait(self.driver, 10).until( + lambda driver: driver.execute_script("return document.readyState") == "complete" + ) + elif self.js_code and type(self.js_code) == list: + for js in self.js_code: + self.driver.execute_script(js) + WebDriverWait(self.driver, 10).until( + lambda driver: driver.execute_script("return document.readyState") == "complete" + ) + + # Optionally, wait for some condition after executing the JS code : Contributed by (https://github.com/jonymusky) + wait_for = kwargs.get('wait_for', False) + if wait_for: + if callable(wait_for): + print("[LOG] 🔄 Waiting for condition...") + WebDriverWait(self.driver, 20).until(wait_for) + else: + print("[LOG] 🔄 Waiting for condition...") + WebDriverWait(self.driver, 20).until( + EC.presence_of_element_located((By.CSS_SELECTOR, wait_for)) + ) + + if not can_not_be_done_headless: + html = sanitize_input_encode(self.driver.page_source) + self.driver = self.execute_hook('before_return_html', self.driver, html) + + # Store in cache + cache_file_path = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", url_hash) + with open(cache_file_path, "w", encoding="utf-8") as f: + f.write(html) + + if self.verbose: + print(f"[LOG] ✅ Crawled {url} successfully!") + + return html + except InvalidArgumentException as e: + if not hasattr(e, 'msg'): + e.msg = sanitize_input_encode(str(e)) + raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}") + except WebDriverException as e: + # If e does nlt have msg attribute create it and set it to str(e) + if not hasattr(e, 'msg'): + e.msg = sanitize_input_encode(str(e)) + raise WebDriverException(f"Failed to crawl {url}: {e.msg}") + except Exception as e: + if not hasattr(e, 'msg'): + e.msg = sanitize_input_encode(str(e)) + raise Exception(f"Failed to crawl {url}: {e.msg}") + + def take_screenshot(self) -> str: + try: + # Get the dimensions of the page + total_width = self.driver.execute_script("return document.body.scrollWidth") + total_height = self.driver.execute_script("return document.body.scrollHeight") + + # Set the window size to the dimensions of the page + self.driver.set_window_size(total_width, total_height) + + # Take screenshot + screenshot = self.driver.get_screenshot_as_png() + + # Open the screenshot with PIL + image = Image.open(BytesIO(screenshot)) + + # Convert image to RGB mode (this will handle both RGB and RGBA images) + rgb_image = image.convert('RGB') + + # Convert to JPEG and compress + buffered = BytesIO() + rgb_image.save(buffered, format="JPEG", quality=85) + img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8') + + if self.verbose: + print(f"[LOG] 📸 Screenshot taken and converted to base64") + + return img_base64 + except Exception as e: + error_message = sanitize_input_encode(f"Failed to take screenshot: {str(e)}") + print(error_message) + + # Generate an image with black background 
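# Putting the Selenium strategy together, as a hedged usage sketch: register an
# optional hook, crawl a page (optionally waiting for a CSS selector and running JS),
# then capture a screenshot. It assumes Chrome is resolvable via selenium-manager and
# that ~/.crawl4ai/cache exists, since crawl() writes its cache file there; the URL,
# selector, and JS snippet are placeholders.
from crawl4ai.crawler_strategy import LocalSeleniumCrawlerStrategy

strategy = LocalSeleniumCrawlerStrategy(
    js_code="window.scrollTo(0, document.body.scrollHeight);",
    verbose=True,
)
strategy.set_hook("before_get_url", lambda driver: driver)  # hooks may return a replacement driver
html = strategy.crawl("https://example.com", wait_for="main article")
screenshot_b64 = strategy.take_screenshot()  # base64 JPEG, or a black error image on failure
strategy.quit()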
+ img = Image.new('RGB', (800, 600), color='black') + draw = ImageDraw.Draw(img) + + # Load a font + try: + font = ImageFont.truetype("arial.ttf", 40) + except IOError: + font = ImageFont.load_default() + + # Define text color and wrap the text + text_color = (255, 255, 255) + max_width = 780 + wrapped_text = wrap_text(draw, error_message, font, max_width) + + # Calculate text position + text_position = (10, 10) + + # Draw the text on the image + draw.text(text_position, wrapped_text, fill=text_color, font=font) + + # Convert to base64 + buffered = BytesIO() + img.save(buffered, format="JPEG") + img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8') + + return img_base64 + + def quit(self): + self.driver.quit() diff --git a/crawl4ai/database.py b/crawl4ai/database.py new file mode 100644 index 0000000000000000000000000000000000000000..42ad70174c1d7b1d7d7a2cb3c4f946dce4683907 --- /dev/null +++ b/crawl4ai/database.py @@ -0,0 +1,135 @@ +import os +from pathlib import Path +import sqlite3 +from typing import Optional, Tuple + +DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") +os.makedirs(DB_PATH, exist_ok=True) +DB_PATH = os.path.join(DB_PATH, "crawl4ai.db") + +def init_db(): + global DB_PATH + conn = sqlite3.connect(DB_PATH) + cursor = conn.cursor() + cursor.execute(''' + CREATE TABLE IF NOT EXISTS crawled_data ( + url TEXT PRIMARY KEY, + html TEXT, + cleaned_html TEXT, + markdown TEXT, + extracted_content TEXT, + success BOOLEAN, + media TEXT DEFAULT "{}", + links TEXT DEFAULT "{}", + metadata TEXT DEFAULT "{}", + screenshot TEXT DEFAULT "" + ) + ''') + conn.commit() + conn.close() + +def alter_db_add_screenshot(new_column: str = "media"): + check_db_path() + try: + conn = sqlite3.connect(DB_PATH) + cursor = conn.cursor() + cursor.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""') + conn.commit() + conn.close() + except Exception as e: + print(f"Error altering database to add screenshot column: {e}") + +def check_db_path(): + if not DB_PATH: + raise ValueError("Database path is not set or is empty.") + +def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]: + check_db_path() + try: + conn = sqlite3.connect(DB_PATH) + cursor = conn.cursor() + cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot FROM crawled_data WHERE url = ?', (url,)) + result = cursor.fetchone() + conn.close() + return result + except Exception as e: + print(f"Error retrieving cached URL: {e}") + return None + +def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media : str = "{}", links : str = "{}", metadata : str = "{}", screenshot: str = ""): + check_db_path() + try: + conn = sqlite3.connect(DB_PATH) + cursor = conn.cursor() + cursor.execute(''' + INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ ON CONFLICT(url) DO UPDATE SET + html = excluded.html, + cleaned_html = excluded.cleaned_html, + markdown = excluded.markdown, + extracted_content = excluded.extracted_content, + success = excluded.success, + media = excluded.media, + links = excluded.links, + metadata = excluded.metadata, + screenshot = excluded.screenshot + ''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot)) + conn.commit() + conn.close() + except Exception as e: + print(f"Error caching URL: {e}") + +def get_total_count() -> int: + check_db_path() + try: + conn = sqlite3.connect(DB_PATH) + cursor = conn.cursor() + cursor.execute('SELECT COUNT(*) FROM crawled_data') + result = cursor.fetchone() + conn.close() + return result[0] + except Exception as e: + print(f"Error getting total count: {e}") + return 0 + +def clear_db(): + check_db_path() + try: + conn = sqlite3.connect(DB_PATH) + cursor = conn.cursor() + cursor.execute('DELETE FROM crawled_data') + conn.commit() + conn.close() + except Exception as e: + print(f"Error clearing database: {e}") + +def flush_db(): + check_db_path() + try: + conn = sqlite3.connect(DB_PATH) + cursor = conn.cursor() + cursor.execute('DROP TABLE crawled_data') + conn.commit() + conn.close() + except Exception as e: + print(f"Error flushing database: {e}") + +def update_existing_records(new_column: str = "media", default_value: str = "{}"): + check_db_path() + try: + conn = sqlite3.connect(DB_PATH) + cursor = conn.cursor() + cursor.execute(f'UPDATE crawled_data SET {new_column} = "{default_value}" WHERE screenshot IS NULL') + conn.commit() + conn.close() + except Exception as e: + print(f"Error updating existing records: {e}") + +if __name__ == "__main__": + # Delete the existing database file + if os.path.exists(DB_PATH): + os.remove(DB_PATH) + init_db() + # alter_db_add_screenshot("COL_NAME") + diff --git a/crawl4ai/docs_manager.py b/crawl4ai/docs_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..aacc5812d15a6ed06d7d9b860cbb1e3781fbfcaf --- /dev/null +++ b/crawl4ai/docs_manager.py @@ -0,0 +1,67 @@ +import requests +import shutil +from pathlib import Path +from crawl4ai.async_logger import AsyncLogger +from crawl4ai.llmtxt import AsyncLLMTextManager + +class DocsManager: + def __init__(self, logger=None): + self.docs_dir = Path.home() / ".crawl4ai" / "docs" + self.local_docs = Path(__file__).parent.parent / "docs" / "llm.txt" + self.docs_dir.mkdir(parents=True, exist_ok=True) + self.logger = logger or AsyncLogger(verbose=True) + self.llm_text = AsyncLLMTextManager(self.docs_dir, self.logger) + + async def ensure_docs_exist(self): + """Fetch docs if not present""" + if not any(self.docs_dir.iterdir()): + await self.fetch_docs() + + async def fetch_docs(self) -> bool: + """Copy from local docs or download from GitHub""" + try: + # Try local first + if self.local_docs.exists() and (any(self.local_docs.glob("*.md")) or any(self.local_docs.glob("*.tokens"))): + # Empty the local docs directory + for file_path in self.docs_dir.glob("*.md"): + file_path.unlink() + # for file_path in self.docs_dir.glob("*.tokens"): + # file_path.unlink() + for file_path in self.local_docs.glob("*.md"): + shutil.copy2(file_path, self.docs_dir / file_path.name) + # for file_path in self.local_docs.glob("*.tokens"): + # shutil.copy2(file_path, self.docs_dir / file_path.name) + return True + + # Fallback to GitHub + response = requests.get( + "https://api.github.com/repos/unclecode/crawl4ai/contents/docs/llm.txt", + headers={'Accept': 
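# Round-trip sketch for the sqlite cache defined in crawl4ai/database.py above:
# initialise the table, upsert one crawled page, then read it back. The URL and payload
# strings are placeholders; media, links, and metadata are stored as JSON text.
from crawl4ai import database

database.init_db()
database.cache_url(
    url="https://example.com/",
    html="<html>...</html>",
    cleaned_html="<article>...</article>",
    markdown="# Example",
    extracted_content="[]",
    success=True,
    media='{"images": []}',
    links='{"internal": [], "external": []}',
    metadata="{}",
)
row = database.get_cached_url("https://example.com/")  # tuple of cached columns, or None
print(database.get_total_count())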
'application/vnd.github.v3+json'} + ) + response.raise_for_status() + + for item in response.json(): + if item['type'] == 'file' and item['name'].endswith('.md'): + content = requests.get(item['download_url']).text + with open(self.docs_dir / item['name'], 'w', encoding='utf-8') as f: + f.write(content) + return True + + except Exception as e: + self.logger.error(f"Failed to fetch docs: {str(e)}") + raise + + def list(self) -> list[str]: + """List available topics""" + names = [file_path.stem for file_path in self.docs_dir.glob("*.md")] + # Remove [0-9]+_ prefix + names = [name.split("_", 1)[1] if name[0].isdigit() else name for name in names] + # Exclude those end with .xs.md and .q.md + names = [name for name in names if not name.endswith(".xs") and not name.endswith(".q")] + return names + + def generate(self, sections, mode="extended"): + return self.llm_text.generate(sections, mode) + + def search(self, query: str, top_k: int = 5): + return self.llm_text.search(query, top_k) \ No newline at end of file diff --git a/crawl4ai/extraction_strategy.bak.py b/crawl4ai/extraction_strategy.bak.py new file mode 100644 index 0000000000000000000000000000000000000000..2048c0ff460233efc9cc9a3f261d69389507d0eb --- /dev/null +++ b/crawl4ai/extraction_strategy.bak.py @@ -0,0 +1,1440 @@ +from abc import ABC, abstractmethod +from typing import Any, List, Dict, Optional, Union +from concurrent.futures import ThreadPoolExecutor, as_completed +import json, time +# from optimum.intel import IPEXModel +from .prompts import * +from .config import * +from .utils import * +from .models import * +from functools import partial +from .model_loader import * +import math +import numpy as np +import re +from bs4 import BeautifulSoup +from lxml import html, etree +from dataclasses import dataclass + +class ExtractionStrategy(ABC): + """ + Abstract base class for all extraction strategies. + """ + + def __init__(self, input_format: str = "markdown", **kwargs): + """ + Initialize the extraction strategy. + + Args: + input_format: Content format to use for extraction. + Options: "markdown" (default), "html", "fit_markdown" + **kwargs: Additional keyword arguments + """ + self.input_format = input_format + self.DEL = "<|DEL|>" + self.name = self.__class__.__name__ + self.verbose = kwargs.get("verbose", False) + + @abstractmethod + def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]: + """ + Extract meaningful blocks or chunks from the given HTML. + + :param url: The URL of the webpage. + :param html: The HTML content of the webpage. + :return: A list of extracted blocks or chunks. + """ + pass + + def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: + """ + Process sections of text in parallel by default. + + :param url: The URL of the webpage. + :param sections: List of sections (strings) to process. + :return: A list of processed JSON blocks. + """ + extracted_content = [] + with ThreadPoolExecutor() as executor: + futures = [executor.submit(self.extract, url, section, **kwargs) for section in sections] + for future in as_completed(futures): + extracted_content.extend(future.result()) + return extracted_content + +class NoExtractionStrategy(ExtractionStrategy): + """ + A strategy that does not extract any meaningful content from the HTML. It simply returns the entire HTML as a single block. + """ + def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]: + """ + Extract meaningful blocks or chunks from the given HTML. 
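# Minimal sketch of the ExtractionStrategy contract above: a subclass only needs to
# implement extract(); the base run() fans the sections out over a thread pool and
# concatenates the per-section results. The word-counting strategy below is a toy
# example, not one of the shipped strategies, and it assumes the same base class is
# exposed from crawl4ai.extraction_strategy.
from typing import Any, Dict, List
from crawl4ai.extraction_strategy import ExtractionStrategy

class WordCountStrategy(ExtractionStrategy):
    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
        return [{"index": 0, "tags": [], "content": f"{len(html.split())} words"}]

strategy = WordCountStrategy(input_format="markdown")
blocks = strategy.run("https://example.com", ["first section ...", "second section ..."])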
+ """ + return [{"index": 0, "content": html}] + + def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: + return [{"index": i, "tags": [], "content": section} for i, section in enumerate(sections)] + +####################################################### +# Strategies using LLM-based extraction for text data # +####################################################### +class LLMExtractionStrategy(ExtractionStrategy): + """ + A strategy that uses an LLM to extract meaningful content from the HTML. + + Attributes: + provider: The provider to use for extraction. It follows the format /, e.g., "ollama/llama3.3". + api_token: The API token for the provider. + instruction: The instruction to use for the LLM model. + schema: Pydantic model schema for structured data. + extraction_type: "block" or "schema". + chunk_token_threshold: Maximum tokens per chunk. + overlap_rate: Overlap between chunks. + word_token_rate: Word to token conversion rate. + apply_chunking: Whether to apply chunking. + base_url: The base URL for the API request. + api_base: The base URL for the API request. + extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc. + verbose: Whether to print verbose output. + usages: List of individual token usages. + total_usage: Accumulated token usage. + """ + + def __init__(self, + provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None, + instruction:str = None, schema:Dict = None, extraction_type = "block", **kwargs): + """ + Initialize the strategy with clustering parameters. + + Args: + provider: The provider to use for extraction. It follows the format /, e.g., "ollama/llama3.3". + api_token: The API token for the provider. + instruction: The instruction to use for the LLM model. + schema: Pydantic model schema for structured data. + extraction_type: "block" or "schema". + chunk_token_threshold: Maximum tokens per chunk. + overlap_rate: Overlap between chunks. + word_token_rate: Word to token conversion rate. + apply_chunking: Whether to apply chunking. + base_url: The base URL for the API request. + api_base: The base URL for the API request. + extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc. + verbose: Whether to print verbose output. + usages: List of individual token usages. + total_usage: Accumulated token usage. + + """ + super().__init__(**kwargs) + self.provider = provider + self.api_token = api_token or PROVIDER_MODELS.get(provider, "no-token") or os.getenv("OPENAI_API_KEY") + self.instruction = instruction + self.extract_type = extraction_type + self.schema = schema + if schema: + self.extract_type = "schema" + + self.chunk_token_threshold = kwargs.get("chunk_token_threshold", CHUNK_TOKEN_THRESHOLD) + self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE) + self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE) + self.apply_chunking = kwargs.get("apply_chunking", True) + self.base_url = kwargs.get("base_url", None) + self.api_base = kwargs.get("api_base", kwargs.get("base_url", None)) + self.extra_args = kwargs.get("extra_args", {}) + if not self.apply_chunking: + self.chunk_token_threshold = 1e9 + + self.verbose = kwargs.get("verbose", False) + self.usages = [] # Store individual usages + self.total_usage = TokenUsage() # Accumulated usage + + if not self.api_token: + raise ValueError("API token must be provided for LLMExtractionStrategy. 
Update the config.py or set OPENAI_API_KEY environment variable.") + + + def extract(self, url: str, ix:int, html: str) -> List[Dict[str, Any]]: + """ + Extract meaningful blocks or chunks from the given HTML using an LLM. + + How it works: + 1. Construct a prompt with variables. + 2. Make a request to the LLM using the prompt. + 3. Parse the response and extract blocks or chunks. + + Args: + url: The URL of the webpage. + ix: Index of the block. + html: The HTML content of the webpage. + + Returns: + A list of extracted blocks or chunks. + """ + if self.verbose: + # print("[LOG] Extracting blocks from URL:", url) + print(f"[LOG] Call LLM for {url} - block index: {ix}") + + variable_values = { + "URL": url, + "HTML": escape_json_string(sanitize_html(html)), + } + + prompt_with_variables = PROMPT_EXTRACT_BLOCKS + if self.instruction: + variable_values["REQUEST"] = self.instruction + prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION + + if self.extract_type == "schema" and self.schema: + variable_values["SCHEMA"] = json.dumps(self.schema, indent=2) + prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION + + for variable in variable_values: + prompt_with_variables = prompt_with_variables.replace( + "{" + variable + "}", variable_values[variable] + ) + + response = perform_completion_with_backoff( + self.provider, + prompt_with_variables, + self.api_token, + base_url=self.api_base or self.base_url, + extra_args = self.extra_args + ) # , json_response=self.extract_type == "schema") + # Track usage + usage = TokenUsage( + completion_tokens=response.usage.completion_tokens, + prompt_tokens=response.usage.prompt_tokens, + total_tokens=response.usage.total_tokens, + completion_tokens_details=response.usage.completion_tokens_details.__dict__ if response.usage.completion_tokens_details else {}, + prompt_tokens_details=response.usage.prompt_tokens_details.__dict__ if response.usage.prompt_tokens_details else {} + ) + self.usages.append(usage) + + # Update totals + self.total_usage.completion_tokens += usage.completion_tokens + self.total_usage.prompt_tokens += usage.prompt_tokens + self.total_usage.total_tokens += usage.total_tokens + + try: + blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks'] + blocks = json.loads(blocks) + for block in blocks: + block['error'] = False + except Exception as e: + parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content) + blocks = parsed + if unparsed: + blocks.append({ + "index": 0, + "error": True, + "tags": ["error"], + "content": unparsed + }) + + if self.verbose: + print("[LOG] Extracted", len(blocks), "blocks from URL:", url, "block index:", ix) + return blocks + + def _merge(self, documents, chunk_token_threshold, overlap): + """ + Merge documents into sections based on chunk_token_threshold and overlap. 
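# Configuration sketch for LLMExtractionStrategy above: choose a provider in
# "provider/model" form, supply an instruction (and optionally a JSON schema for
# structured output), and let run() split the page into token-bounded, overlapping
# chunks before calling the model. Provider, token, and instruction are placeholders.
from crawl4ai.extraction_strategy import LLMExtractionStrategy

strategy = LLMExtractionStrategy(
    provider="openai/gpt-4o-mini",
    api_token="sk-placeholder",     # or rely on the OPENAI_API_KEY environment variable
    instruction="List every product name and price mentioned on the page.",
    extraction_type="block",
    chunk_token_threshold=2000,     # maximum tokens per chunk sent to the model
    overlap_rate=0.1,               # 10% overlap between consecutive chunks
    verbose=True,
)
# blocks = strategy.run(url, sections)  # sections: list of markdown/HTML chunks
# strategy.show_usage()                 # prints the accumulated token usage report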
+ """ + chunks = [] + sections = [] + total_tokens = 0 + + # Calculate the total tokens across all documents + for document in documents: + total_tokens += len(document.split(' ')) * self.word_token_rate + + # Calculate the number of sections needed + num_sections = math.floor(total_tokens / chunk_token_threshold) + if num_sections < 1: + num_sections = 1 # Ensure there is at least one section + adjusted_chunk_threshold = total_tokens / num_sections + + total_token_so_far = 0 + current_chunk = [] + + for document in documents: + tokens = document.split(' ') + token_count = len(tokens) * self.word_token_rate + + if total_token_so_far + token_count <= adjusted_chunk_threshold: + current_chunk.extend(tokens) + total_token_so_far += token_count + else: + # Ensure to handle the last section properly + if len(sections) == num_sections - 1: + current_chunk.extend(tokens) + continue + + # Add overlap if specified + if overlap > 0 and current_chunk: + overlap_tokens = current_chunk[-overlap:] + current_chunk.extend(overlap_tokens) + + sections.append(' '.join(current_chunk)) + current_chunk = tokens + total_token_so_far = token_count + + # Add the last chunk + if current_chunk: + sections.append(' '.join(current_chunk)) + + return sections + + + def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]: + """ + Process sections sequentially with a delay for rate limiting issues, specifically for LLMExtractionStrategy. + + Args: + url: The URL of the webpage. + sections: List of sections (strings) to process. + + Returns: + A list of extracted blocks or chunks. + """ + + merged_sections = self._merge( + sections, self.chunk_token_threshold, + overlap= int(self.chunk_token_threshold * self.overlap_rate) + ) + extracted_content = [] + if self.provider.startswith("groq/"): + # Sequential processing with a delay + for ix, section in enumerate(merged_sections): + extract_func = partial(self.extract, url) + extracted_content.extend(extract_func(ix, sanitize_input_encode(section))) + time.sleep(0.5) # 500 ms delay between each processing + else: + # Parallel processing using ThreadPoolExecutor + # extract_func = partial(self.extract, url) + # for ix, section in enumerate(merged_sections): + # extracted_content.append(extract_func(ix, section)) + + with ThreadPoolExecutor(max_workers=4) as executor: + extract_func = partial(self.extract, url) + futures = [executor.submit(extract_func, ix, sanitize_input_encode(section)) for ix, section in enumerate(merged_sections)] + + for future in as_completed(futures): + try: + extracted_content.extend(future.result()) + except Exception as e: + if self.verbose: + print(f"Error in thread execution: {e}") + # Add error information to extracted_content + extracted_content.append({ + "index": 0, + "error": True, + "tags": ["error"], + "content": str(e) + }) + + + return extracted_content + + + def show_usage(self) -> None: + """Print a detailed token usage report showing total and per-request usage.""" + print("\n=== Token Usage Summary ===") + print(f"{'Type':<15} {'Count':>12}") + print("-" * 30) + print(f"{'Completion':<15} {self.total_usage.completion_tokens:>12,}") + print(f"{'Prompt':<15} {self.total_usage.prompt_tokens:>12,}") + print(f"{'Total':<15} {self.total_usage.total_tokens:>12,}") + + print("\n=== Usage History ===") + print(f"{'Request #':<10} {'Completion':>12} {'Prompt':>12} {'Total':>12}") + print("-" * 48) + for i, usage in enumerate(self.usages, 1): + print(f"{i:<10} {usage.completion_tokens:>12,} {usage.prompt_tokens:>12,} 
{usage.total_tokens:>12,}") + +####################################################### +# Strategies using clustering for text data extraction # +####################################################### + +class CosineStrategy(ExtractionStrategy): + """ + Extract meaningful blocks or chunks from the given HTML using cosine similarity. + + How it works: + 1. Pre-filter documents using embeddings and semantic_filter. + 2. Perform clustering using cosine similarity. + 3. Organize texts by their cluster labels, retaining order. + 4. Filter clusters by word count. + 5. Extract meaningful blocks or chunks from the filtered clusters. + + Attributes: + semantic_filter (str): A keyword filter for document filtering. + word_count_threshold (int): Minimum number of words per cluster. + max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters. + linkage_method (str): The linkage method for hierarchical clustering. + top_k (int): Number of top categories to extract. + model_name (str): The name of the sentence-transformers model. + sim_threshold (float): The similarity threshold for clustering. + """ + def __init__(self, semantic_filter = None, word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name = 'sentence-transformers/all-MiniLM-L6-v2', sim_threshold = 0.3, **kwargs): + """ + Initialize the strategy with clustering parameters. + + Args: + semantic_filter (str): A keyword filter for document filtering. + word_count_threshold (int): Minimum number of words per cluster. + max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters. + linkage_method (str): The linkage method for hierarchical clustering. + top_k (int): Number of top categories to extract. + """ + super().__init__(**kwargs) + + import numpy as np + + self.semantic_filter = semantic_filter + self.word_count_threshold = word_count_threshold + self.max_dist = max_dist + self.linkage_method = linkage_method + self.top_k = top_k + self.sim_threshold = sim_threshold + self.timer = time.time() + self.verbose = kwargs.get("verbose", False) + + self.buffer_embeddings = np.array([]) + self.get_embedding_method = "direct" + + self.device = get_device() + # import torch + # self.device = torch.device('cpu') + + self.default_batch_size = calculate_batch_size(self.device) + + if self.verbose: + print(f"[LOG] Loading Extraction Model for {self.device.type} device.") + + # if False and self.device.type == "cpu": + # self.model = load_onnx_all_MiniLM_l6_v2() + # self.tokenizer = self.model.tokenizer + # self.get_embedding_method = "direct" + # else: + + self.tokenizer, self.model = load_HF_embedding_model(model_name) + self.model.to(self.device) + self.model.eval() + + self.get_embedding_method = "batch" + + self.buffer_embeddings = np.array([]) + + # if model_name == "bert-base-uncased": + # self.tokenizer, self.model = load_bert_base_uncased() + # self.model.eval() # Ensure the model is in evaluation mode + # self.get_embedding_method = "batch" + # elif model_name == "BAAI/bge-small-en-v1.5": + # self.tokenizer, self.model = load_bge_small_en_v1_5() + # self.model.eval() # Ensure the model is in evaluation mode + # self.get_embedding_method = "batch" + # elif model_name == "sentence-transformers/all-MiniLM-L6-v2": + # self.model = load_onnx_all_MiniLM_l6_v2() + # self.tokenizer = self.model.tokenizer + # self.get_embedding_method = "direct" + + + if self.verbose: + print(f"[LOG] Loading Multilabel Classifier for {self.device.type} device.") + + self.nlp, _ = 
load_text_multilabel_classifier() + # self.default_batch_size = 16 if self.device.type == 'cpu' else 64 + + if self.verbose: + print(f"[LOG] Model loaded {model_name}, models/reuters, took " + str(time.time() - self.timer) + " seconds") + + def filter_documents_embeddings(self, documents: List[str], semantic_filter: str, at_least_k: int = 20) -> List[str]: + """ + Filter and sort documents based on the cosine similarity of their embeddings with the semantic_filter embedding. + + Args: + documents (List[str]): A list of document texts. + semantic_filter (str): A keyword filter for document filtering. + at_least_k (int): The minimum number of documents to return. + + Returns: + List[str]: A list of filtered and sorted document texts. + """ + + if not semantic_filter: + return documents + + if len(documents) < at_least_k: + at_least_k = len(documents) // 2 + + from sklearn.metrics.pairwise import cosine_similarity + + # Compute embedding for the keyword filter + query_embedding = self.get_embeddings([semantic_filter])[0] + + # Compute embeddings for the documents + document_embeddings = self.get_embeddings(documents) + + # Calculate cosine similarity between the query embedding and document embeddings + similarities = cosine_similarity([query_embedding], document_embeddings).flatten() + + # Filter documents based on the similarity threshold + filtered_docs = [(doc, sim) for doc, sim in zip(documents, similarities) if sim >= self.sim_threshold] + + # If the number of filtered documents is less than at_least_k, sort remaining documents by similarity + if len(filtered_docs) < at_least_k: + remaining_docs = [(doc, sim) for doc, sim in zip(documents, similarities) if sim < self.sim_threshold] + remaining_docs.sort(key=lambda x: x[1], reverse=True) + filtered_docs.extend(remaining_docs[:at_least_k - len(filtered_docs)]) + + # Extract the document texts from the tuples + filtered_docs = [doc for doc, _ in filtered_docs] + + return filtered_docs[:at_least_k] + + def get_embeddings(self, sentences: List[str], batch_size=None, bypass_buffer=False): + """ + Get BERT embeddings for a list of sentences. + + Args: + sentences (List[str]): A list of text chunks (sentences). + + Returns: + NumPy array of embeddings. 
+ """ + # if self.buffer_embeddings.any() and not bypass_buffer: + # return self.buffer_embeddings + + if self.device.type in [ "cpu", "gpu", "cuda", "mps"]: + import torch + # Tokenize sentences and convert to tensor + if batch_size is None: + batch_size = self.default_batch_size + + all_embeddings = [] + for i in range(0, len(sentences), batch_size): + batch_sentences = sentences[i:i + batch_size] + encoded_input = self.tokenizer(batch_sentences, padding=True, truncation=True, return_tensors='pt') + encoded_input = {key: tensor.to(self.device) for key, tensor in encoded_input.items()} + + # Ensure no gradients are calculated + with torch.no_grad(): + model_output = self.model(**encoded_input) + + # Get embeddings from the last hidden state (mean pooling) + embeddings = model_output.last_hidden_state.mean(dim=1).cpu().numpy() + all_embeddings.append(embeddings) + + self.buffer_embeddings = np.vstack(all_embeddings) + elif self.device.type == "cpu": + # self.buffer_embeddings = self.model(sentences) + if batch_size is None: + batch_size = self.default_batch_size + + all_embeddings = [] + for i in range(0, len(sentences), batch_size): + batch_sentences = sentences[i:i + batch_size] + embeddings = self.model(batch_sentences) + all_embeddings.append(embeddings) + + self.buffer_embeddings = np.vstack(all_embeddings) + return self.buffer_embeddings + + def hierarchical_clustering(self, sentences: List[str], embeddings = None): + """ + Perform hierarchical clustering on sentences and return cluster labels. + + Args: + sentences (List[str]): A list of text chunks (sentences). + + Returns: + NumPy array of cluster labels. + """ + # Get embeddings + from scipy.cluster.hierarchy import linkage, fcluster + from scipy.spatial.distance import pdist + self.timer = time.time() + embeddings = self.get_embeddings(sentences, bypass_buffer=True) + # print(f"[LOG] 🚀 Embeddings computed in {time.time() - self.timer:.2f} seconds") + # Compute pairwise cosine distances + distance_matrix = pdist(embeddings, 'cosine') + # Perform agglomerative clustering respecting order + linked = linkage(distance_matrix, method=self.linkage_method) + # Form flat clusters + labels = fcluster(linked, self.max_dist, criterion='distance') + return labels + + def filter_clusters_by_word_count(self, clusters: Dict[int, List[str]]) -> Dict[int, List[str]]: + """ + Filter clusters to remove those with a word count below the threshold. + + Args: + clusters (Dict[int, List[str]]): Dictionary of clusters. + + Returns: + Dict[int, List[str]]: Filtered dictionary of clusters. + """ + filtered_clusters = {} + for cluster_id, texts in clusters.items(): + # Concatenate texts for analysis + full_text = " ".join(texts) + # Count words + word_count = len(full_text.split()) + + # Keep clusters with word count above the threshold + if word_count >= self.word_count_threshold: + filtered_clusters[cluster_id] = texts + + return filtered_clusters + + def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]: + """ + Extract clusters from HTML content using hierarchical clustering. + + Args: + url (str): The URL of the webpage. + html (str): The HTML content of the webpage. + + Returns: + List[Dict[str, Any]]: A list of processed JSON blocks. 
+ """ + # Assume `html` is a list of text chunks for this strategy + t = time.time() + text_chunks = html.split(self.DEL) # Split by lines or paragraphs as needed + + # Pre-filter documents using embeddings and semantic_filter + text_chunks = self.filter_documents_embeddings(text_chunks, self.semantic_filter) + + if not text_chunks: + return [] + + # Perform clustering + labels = self.hierarchical_clustering(text_chunks) + # print(f"[LOG] 🚀 Clustering done in {time.time() - t:.2f} seconds") + + # Organize texts by their cluster labels, retaining order + t = time.time() + clusters = {} + for index, label in enumerate(labels): + clusters.setdefault(label, []).append(text_chunks[index]) + + # Filter clusters by word count + filtered_clusters = self.filter_clusters_by_word_count(clusters) + + # Convert filtered clusters to a sorted list of dictionaries + cluster_list = [{"index": int(idx), "tags" : [], "content": " ".join(filtered_clusters[idx])} for idx in sorted(filtered_clusters)] + + if self.verbose: + print(f"[LOG] 🚀 Assign tags using {self.device}") + + if self.device.type in ["gpu", "cuda", "mps", "cpu"]: + labels = self.nlp([cluster['content'] for cluster in cluster_list]) + + for cluster, label in zip(cluster_list, labels): + cluster['tags'] = label + # elif self.device.type == "cpu": + # # Process the text with the loaded model + # texts = [cluster['content'] for cluster in cluster_list] + # # Batch process texts + # docs = self.nlp.pipe(texts, disable=["tagger", "parser", "ner", "lemmatizer"]) + + # for doc, cluster in zip(docs, cluster_list): + # tok_k = self.top_k + # top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k] + # cluster['tags'] = [cat for cat, _ in top_categories] + + # for cluster in cluster_list: + # doc = self.nlp(cluster['content']) + # tok_k = self.top_k + # top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k] + # cluster['tags'] = [cat for cat, _ in top_categories] + + if self.verbose: + print(f"[LOG] 🚀 Categorization done in {time.time() - t:.2f} seconds") + + return cluster_list + + def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: + """ + Process sections using hierarchical clustering. + + Args: + url (str): The URL of the webpage. + sections (List[str]): List of sections (strings) to process. + + Returns: + """ + # This strategy processes all sections together + + return self.extract(url, self.DEL.join(sections), **kwargs) + +####################################################### +# New extraction strategies for JSON-based extraction # +####################################################### + +class JsonElementExtractionStrategy(ExtractionStrategy): + """ + Abstract base class for extracting structured JSON from HTML content. + + How it works: + 1. Parses HTML content using the `_parse_html` method. + 2. Uses a schema to define base selectors, fields, and transformations. + 3. Extracts data hierarchically, supporting nested fields and lists. + 4. Handles computed fields with expressions or functions. + + Attributes: + DEL (str): Delimiter used to combine HTML sections. Defaults to '\n'. + schema (Dict[str, Any]): The schema defining the extraction rules. + verbose (bool): Enables verbose logging for debugging purposes. + + Methods: + extract(url, html_content, *q, **kwargs): Extracts structured data from HTML content. + _extract_item(element, fields): Extracts fields from a single element. 
+ _extract_single_field(element, field): Extracts a single field based on its type. + _apply_transform(value, transform): Applies a transformation to a value. + _compute_field(item, field): Computes a field value using an expression or function. + run(url, sections, *q, **kwargs): Combines HTML sections and runs the extraction strategy. + + Abstract Methods: + _parse_html(html_content): Parses raw HTML into a structured format (e.g., BeautifulSoup or lxml). + _get_base_elements(parsed_html, selector): Retrieves base elements using a selector. + _get_elements(element, selector): Retrieves child elements using a selector. + _get_element_text(element): Extracts text content from an element. + _get_element_html(element): Extracts raw HTML from an element. + _get_element_attribute(element, attribute): Extracts an attribute's value from an element. + """ + + + DEL = '\n' + + def __init__(self, schema: Dict[str, Any], **kwargs): + """ + Initialize the JSON element extraction strategy with a schema. + + Args: + schema (Dict[str, Any]): The schema defining the extraction rules. + """ + super().__init__(**kwargs) + self.schema = schema + self.verbose = kwargs.get('verbose', False) + + def extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]: + """ + Extract structured data from HTML content. + + How it works: + 1. Parses the HTML content using the `_parse_html` method. + 2. Identifies base elements using the schema's base selector. + 3. Extracts fields from each base element using `_extract_item`. + + Args: + url (str): The URL of the page being processed. + html_content (str): The raw HTML content to parse and extract. + *q: Additional positional arguments. + **kwargs: Additional keyword arguments for custom extraction. + + Returns: + List[Dict[str, Any]]: A list of extracted items, each represented as a dictionary. 
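To make the schema format concrete, here is an illustrative schema (selectors and field names are invented for a hypothetical product-listing page); it is meant to be passed to one of the concrete subclasses defined further down, for example the CSS-based JsonCssExtractionStrategy:

    # Hypothetical schema for a product listing page. Field types shown:
    # attribute (via baseFields), text, regex, list, and computed.
    product_schema = {
        "baseSelector": "div.product",          # one extracted item per matching element
        "baseFields": [
            {"name": "product_id", "type": "attribute", "attribute": "data-id"},
        ],
        "fields": [
            {"name": "title", "type": "text", "selector": "h2.title", "transform": "strip"},
            {"name": "url", "type": "attribute", "selector": "a.details", "attribute": "href"},
            {"name": "price", "type": "regex", "selector": "span.price",
             "pattern": r"\$([\d.]+)", "default": None},
            {"name": "tags", "type": "list", "selector": "ul.tags li",
             "fields": [{"name": "tag", "type": "text"}]},
            # computed fields see the values already extracted for this item
            {"name": "title_upper", "type": "computed", "expression": "title.upper()"},
        ],
    }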
+ """ + + parsed_html = self._parse_html(html_content) + base_elements = self._get_base_elements(parsed_html, self.schema['baseSelector']) + + results = [] + for element in base_elements: + # Extract base element attributes + item = {} + if 'baseFields' in self.schema: + for field in self.schema['baseFields']: + value = self._extract_single_field(element, field) + if value is not None: + item[field['name']] = value + + # Extract child fields + field_data = self._extract_item(element, self.schema['fields']) + item.update(field_data) + + if item: + results.append(item) + + return results + + @abstractmethod + def _parse_html(self, html_content: str): + """Parse HTML content into appropriate format""" + pass + + @abstractmethod + def _get_base_elements(self, parsed_html, selector: str): + """Get all base elements using the selector""" + pass + + @abstractmethod + def _get_elements(self, element, selector: str): + """Get child elements using the selector""" + pass + + def _extract_field(self, element, field): + try: + if field['type'] == 'nested': + nested_elements = self._get_elements(element, field['selector']) + nested_element = nested_elements[0] if nested_elements else None + return self._extract_item(nested_element, field['fields']) if nested_element else {} + + if field['type'] == 'list': + elements = self._get_elements(element, field['selector']) + return [self._extract_list_item(el, field['fields']) for el in elements] + + if field['type'] == 'nested_list': + elements = self._get_elements(element, field['selector']) + return [self._extract_item(el, field['fields']) for el in elements] + + return self._extract_single_field(element, field) + except Exception as e: + if self.verbose: + print(f"Error extracting field {field['name']}: {str(e)}") + return field.get('default') + + def _extract_single_field(self, element, field): + """ + Extract a single field based on its type. + + How it works: + 1. Selects the target element using the field's selector. + 2. Extracts the field value based on its type (e.g., text, attribute, regex). + 3. Applies transformations if defined in the schema. + + Args: + element: The base element to extract the field from. + field (Dict[str, Any]): The field definition in the schema. + + Returns: + Any: The extracted field value. + """ + + if 'selector' in field: + selected = self._get_elements(element, field['selector']) + if not selected: + return field.get('default') + selected = selected[0] + else: + selected = element + + value = None + if field['type'] == 'text': + value = self._get_element_text(selected) + elif field['type'] == 'attribute': + value = self._get_element_attribute(selected, field['attribute']) + elif field['type'] == 'html': + value = self._get_element_html(selected) + elif field['type'] == 'regex': + text = self._get_element_text(selected) + match = re.search(field['pattern'], text) + value = match.group(1) if match else None + + if 'transform' in field: + value = self._apply_transform(value, field['transform']) + + return value if value is not None else field.get('default') + + def _extract_list_item(self, element, fields): + item = {} + for field in fields: + value = self._extract_single_field(element, field) + if value is not None: + item[field['name']] = value + return item + + def _extract_item(self, element, fields): + """ + Extracts fields from a given element. + + How it works: + 1. Iterates through the fields defined in the schema. + 2. Handles computed, single, and nested field types. + 3. 
Updates the item dictionary with extracted field values. + + Args: + element: The base element to extract fields from. + fields (List[Dict[str, Any]]): The list of fields to extract. + + Returns: + Dict[str, Any]: A dictionary representing the extracted item. + """ + + item = {} + for field in fields: + if field['type'] == 'computed': + value = self._compute_field(item, field) + else: + value = self._extract_field(element, field) + if value is not None: + item[field['name']] = value + return item + + def _apply_transform(self, value, transform): + """ + Apply a transformation to a value. + + How it works: + 1. Checks the transformation type (e.g., `lowercase`, `strip`). + 2. Applies the transformation to the value. + 3. Returns the transformed value. + + Args: + value (str): The value to transform. + transform (str): The type of transformation to apply. + + Returns: + str: The transformed value. + """ + + if transform == 'lowercase': + return value.lower() + elif transform == 'uppercase': + return value.upper() + elif transform == 'strip': + return value.strip() + return value + + def _compute_field(self, item, field): + try: + if 'expression' in field: + return eval(field['expression'], {}, item) + elif 'function' in field: + return field['function'](item) + except Exception as e: + if self.verbose: + print(f"Error computing field {field['name']}: {str(e)}") + return field.get('default') + + def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: + """ + Run the extraction strategy on a combined HTML content. + + How it works: + 1. Combines multiple HTML sections using the `DEL` delimiter. + 2. Calls the `extract` method with the combined HTML. + + Args: + url (str): The URL of the page being processed. + sections (List[str]): A list of HTML sections. + *q: Additional positional arguments. + **kwargs: Additional keyword arguments for custom extraction. + + Returns: + List[Dict[str, Any]]: A list of extracted items. + """ + + combined_html = self.DEL.join(sections) + return self.extract(url, combined_html, **kwargs) + + @abstractmethod + def _get_element_text(self, element) -> str: + """Get text content from element""" + pass + + @abstractmethod + def _get_element_html(self, element) -> str: + """Get HTML content from element""" + pass + + @abstractmethod + def _get_element_attribute(self, element, attribute: str): + """Get attribute value from element""" + pass + +class JsonCssExtractionStrategy(JsonElementExtractionStrategy): + """ + Concrete implementation of `JsonElementExtractionStrategy` using CSS selectors. + + How it works: + 1. Parses HTML content with BeautifulSoup. + 2. Selects elements using CSS selectors defined in the schema. + 3. Extracts field data and applies transformations as defined. + + Attributes: + schema (Dict[str, Any]): The schema defining the extraction rules. + verbose (bool): Enables verbose logging for debugging purposes. + + Methods: + _parse_html(html_content): Parses HTML content into a BeautifulSoup object. + _get_base_elements(parsed_html, selector): Selects base elements using a CSS selector. + _get_elements(element, selector): Selects child elements using a CSS selector. + _get_element_text(element): Extracts text content from a BeautifulSoup element. + _get_element_html(element): Extracts the raw HTML content of a BeautifulSoup element. + _get_element_attribute(element, attribute): Retrieves an attribute value from a BeautifulSoup element. 
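A brief usage sketch for the CSS-based implementation (the HTML snippet and schema are invented):

    # Illustrative run of JsonCssExtractionStrategy over a tiny HTML snippet.
    from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

    html_doc = """
    <div class="product" data-id="42">
      <h2 class="title"> Widget </h2>
      <span class="price">$9.99</span>
    </div>
    """

    schema = {
        "baseSelector": "div.product",
        "fields": [
            {"name": "title", "type": "text", "selector": "h2.title"},
            {"name": "price", "type": "regex", "selector": "span.price", "pattern": r"\$([\d.]+)"},
        ],
    }

    strategy = JsonCssExtractionStrategy(schema, verbose=True)
    items = strategy.extract("https://example.com", html_doc)
    print(items)   # e.g. [{'title': 'Widget', 'price': '9.99'}]

The same schema shape works with the XPath variant defined next, with one difference: its baseSelector must be an XPath expression, while simple CSS field selectors such as "div > p" are converted to "//div/p" by _basic_css_to_xpath.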
+ """ + + def __init__(self, schema: Dict[str, Any], **kwargs): + kwargs['input_format'] = 'html' # Force HTML input + super().__init__(schema, **kwargs) + + def _parse_html(self, html_content: str): + return BeautifulSoup(html_content, 'html.parser') + + def _get_base_elements(self, parsed_html, selector: str): + return parsed_html.select(selector) + + def _get_elements(self, element, selector: str): + selected = element.select_one(selector) + return [selected] if selected else [] + + def _get_element_text(self, element) -> str: + return element.get_text(strip=True) + + def _get_element_html(self, element) -> str: + return str(element) + + def _get_element_attribute(self, element, attribute: str): + return element.get(attribute) + +class JsonXPathExtractionStrategy(JsonElementExtractionStrategy): + """ + Concrete implementation of `JsonElementExtractionStrategy` using XPath selectors. + + How it works: + 1. Parses HTML content into an lxml tree. + 2. Selects elements using XPath expressions. + 3. Converts CSS selectors to XPath when needed. + + Attributes: + schema (Dict[str, Any]): The schema defining the extraction rules. + verbose (bool): Enables verbose logging for debugging purposes. + + Methods: + _parse_html(html_content): Parses HTML content into an lxml tree. + _get_base_elements(parsed_html, selector): Selects base elements using an XPath selector. + _css_to_xpath(css_selector): Converts a CSS selector to an XPath expression. + _get_elements(element, selector): Selects child elements using an XPath selector. + _get_element_text(element): Extracts text content from an lxml element. + _get_element_html(element): Extracts the raw HTML content of an lxml element. + _get_element_attribute(element, attribute): Retrieves an attribute value from an lxml element. + """ + + def __init__(self, schema: Dict[str, Any], **kwargs): + kwargs['input_format'] = 'html' # Force HTML input + super().__init__(schema, **kwargs) + + def _parse_html(self, html_content: str): + return html.fromstring(html_content) + + def _get_base_elements(self, parsed_html, selector: str): + return parsed_html.xpath(selector) + + def _css_to_xpath(self, css_selector: str) -> str: + """Convert CSS selector to XPath if needed""" + if '/' in css_selector: # Already an XPath + return css_selector + return self._basic_css_to_xpath(css_selector) + + def _basic_css_to_xpath(self, css_selector: str) -> str: + """Basic CSS to XPath conversion for common cases""" + if ' > ' in css_selector: + parts = css_selector.split(' > ') + return '//' + '/'.join(parts) + if ' ' in css_selector: + parts = css_selector.split(' ') + return '//' + '//'.join(parts) + return '//' + css_selector + + def _get_elements(self, element, selector: str): + xpath = self._css_to_xpath(selector) + if not xpath.startswith('.'): + xpath = '.' + xpath + return element.xpath(xpath) + + def _get_element_text(self, element) -> str: + return ''.join(element.xpath('.//text()')).strip() + + def _get_element_html(self, element) -> str: + return etree.tostring(element, encoding='unicode') + + def _get_element_attribute(self, element, attribute: str): + return element.get(attribute) + + +####################################################### +# Strategies based on the extraction of specific types# +####################################################### + +class TopicExtractionStrategy(ExtractionStrategy): + def __init__(self, num_keywords: int = 3, **kwargs): + """ + Initialize the topic extraction strategy with parameters for topic segmentation. 
+ + :param num_keywords: Number of keywords to represent each topic segment. + """ + import nltk + super().__init__(**kwargs) + self.num_keywords = num_keywords + self.tokenizer = nltk.TextTilingTokenizer() + + def extract_keywords(self, text: str) -> List[str]: + """ + Extract keywords from a given text segment using simple frequency analysis. + + :param text: The text segment from which to extract keywords. + :return: A list of keyword strings. + """ + import nltk + # Tokenize the text and compute word frequency + words = nltk.word_tokenize(text) + freq_dist = nltk.FreqDist(words) + # Get the most common words as keywords + keywords = [word for (word, _) in freq_dist.most_common(self.num_keywords)] + return keywords + + def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]: + """ + Extract topics from HTML content using TextTiling for segmentation and keyword extraction. + + :param url: The URL of the webpage. + :param html: The HTML content of the webpage. + :param provider: The provider to be used for extraction (not used here). + :param api_token: Optional API token for the provider (not used here). + :return: A list of dictionaries representing the topics. + """ + # Use TextTiling to segment the text into topics + segmented_topics = html.split(self.DEL) # Split by lines or paragraphs as needed + + # Prepare the output as a list of dictionaries + topic_list = [] + for i, segment in enumerate(segmented_topics): + # Extract keywords for each segment + keywords = self.extract_keywords(segment) + topic_list.append({ + "index": i, + "content": segment, + "keywords": keywords + }) + + return topic_list + + def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: + """ + Process sections using topic segmentation and keyword extraction. + + :param url: The URL of the webpage. + :param sections: List of sections (strings) to process. + :param provider: The provider to be used for extraction (not used here). + :param api_token: Optional API token for the provider (not used here). + :return: A list of processed JSON blocks. + """ + # Concatenate sections into a single text for coherent topic segmentation + + + return self.extract(url, self.DEL.join(sections), **kwargs) + +class ContentSummarizationStrategy(ExtractionStrategy): + def __init__(self, model_name: str = "sshleifer/distilbart-cnn-12-6", **kwargs): + """ + Initialize the content summarization strategy with a specific model. + + :param model_name: The model to use for summarization. + """ + super().__init__(**kwargs) + from transformers import pipeline + self.summarizer = pipeline("summarization", model=model_name) + + def extract(self, url: str, text: str, provider: str = None, api_token: Optional[str] = None) -> List[Dict[str, Any]]: + """ + Summarize a single section of text. + + :param url: The URL of the webpage. + :param text: A section of text to summarize. + :param provider: The provider to be used for extraction (not used here). + :param api_token: Optional API token for the provider (not used here). + :return: A dictionary with the summary. 
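An illustrative call pattern for this summarization strategy (the text is invented; the default model weights are downloaded on first use by the transformers pipeline):

    # Illustrative only: summarize two sections in parallel via run().
    from crawl4ai.extraction_strategy import ContentSummarizationStrategy

    sections = [
        "Long paragraph number one about the history of the project ...",
        "Long paragraph number two describing the benchmark methodology ...",
    ]

    strategy = ContentSummarizationStrategy()  # defaults to sshleifer/distilbart-cnn-12-6
    summaries = strategy.run("https://example.com", sections)
    # -> [{'summary': '...'}, {'summary': '...'}], returned in the original section order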
+ """ + try: + summary = self.summarizer(text, max_length=130, min_length=30, do_sample=False) + return {"summary": summary[0]['summary_text']} + except Exception as e: + print(f"Error summarizing text: {e}") + return {"summary": text} # Fallback to original text if summarization fails + + def run(self, url: str, sections: List[str], provider: str = None, api_token: Optional[str] = None) -> List[Dict[str, Any]]: + """ + Process each section in parallel to produce summaries. + + :param url: The URL of the webpage. + :param sections: List of sections (strings) to summarize. + :param provider: The provider to be used for extraction (not used here). + :param api_token: Optional API token for the provider (not used here). + :return: A list of dictionaries with summaries for each section. + """ + # Use a ThreadPoolExecutor to summarize in parallel + summaries = [] + with ThreadPoolExecutor() as executor: + # Create a future for each section's summarization + future_to_section = {executor.submit(self.extract, url, section, provider, api_token): i for i, section in enumerate(sections)} + for future in as_completed(future_to_section): + section_index = future_to_section[future] + try: + summary_result = future.result() + summaries.append((section_index, summary_result)) + except Exception as e: + print(f"Error processing section {section_index}: {e}") + summaries.append((section_index, {"summary": sections[section_index]})) # Fallback to original text + + # Sort summaries by the original section index to maintain order + summaries.sort(key=lambda x: x[0]) + return [summary for _, summary in summaries] + +####################################################### +# Deprecated strategies +####################################################### + +class _JsonCssExtractionStrategy(ExtractionStrategy): + def __init__(self, schema: Dict[str, Any], **kwargs): + kwargs['input_format'] = 'html' # Force HTML input + super().__init__(**kwargs) + self.schema = schema + + def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]: + soup = BeautifulSoup(html, 'html.parser') + base_elements = soup.select(self.schema['baseSelector']) + + results = [] + for element in base_elements: + # Extract base element attributes first + item = {} + if 'baseFields' in self.schema: + for field in self.schema['baseFields']: + value = self._extract_single_field(element, field) + if value is not None: + item[field['name']] = value + + # Then extract child fields + field_data = self._extract_item(element, self.schema['fields']) + item.update(field_data) + + results.append(item) + + return results + + def _extract_field(self, element, field): + try: + if field['type'] == 'nested': + nested_element = element.select_one(field['selector']) + return self._extract_item(nested_element, field['fields']) if nested_element else {} + + if field['type'] == 'list': + elements = element.select(field['selector']) + return [self._extract_list_item(el, field['fields']) for el in elements] + + if field['type'] == 'nested_list': + elements = element.select(field['selector']) + return [self._extract_item(el, field['fields']) for el in elements] + + return self._extract_single_field(element, field) + except Exception as e: + if self.verbose: + print(f"Error extracting field {field['name']}: {str(e)}") + return field.get('default') + + def _extract_list_item(self, element, fields): + item = {} + for field in fields: + value = self._extract_single_field(element, field) + if value is not None: + item[field['name']] = value + return item + 
+ def _extract_single_field(self, element, field): + if 'selector' in field: + selected = element.select_one(field['selector']) + if not selected: + return field.get('default') + else: + selected = element + + value = None + if field['type'] == 'text': + value = selected.get_text(strip=True) + elif field['type'] == 'attribute': + value = selected.get(field['attribute']) + elif field['type'] == 'html': + value = str(selected) + elif field['type'] == 'regex': + text = selected.get_text(strip=True) + match = re.search(field['pattern'], text) + value = match.group(1) if match else None + + if 'transform' in field: + value = self._apply_transform(value, field['transform']) + + return value if value is not None else field.get('default') + + def _extract_item(self, element, fields): + item = {} + for field in fields: + if field['type'] == 'computed': + value = self._compute_field(item, field) + else: + value = self._extract_field(element, field) + if value is not None: + item[field['name']] = value + return item + + def _apply_transform(self, value, transform): + if transform == 'lowercase': + return value.lower() + elif transform == 'uppercase': + return value.upper() + elif transform == 'strip': + return value.strip() + return value + + def _compute_field(self, item, field): + try: + if 'expression' in field: + return eval(field['expression'], {}, item) + elif 'function' in field: + return field['function'](item) + except Exception as e: + if self.verbose: + print(f"Error computing field {field['name']}: {str(e)}") + return field.get('default') + + def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: + combined_html = self.DEL.join(sections) + return self.extract(url, combined_html, **kwargs) +class _JsonXPathExtractionStrategy(ExtractionStrategy): + def __init__(self, schema: Dict[str, Any], **kwargs): + kwargs['input_format'] = 'html' # Force HTML input + super().__init__(**kwargs) + self.schema = schema + + def extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]: + tree = html.fromstring(html_content) + base_xpath = self.schema['baseSelector'] + base_elements = tree.xpath(base_xpath) + + results = [] + for element in base_elements: + # Extract base element attributes first + item = {} + if 'baseFields' in self.schema: + for field in self.schema['baseFields']: + value = self._extract_single_field(element, field) + if value is not None: + item[field['name']] = value + + # Then extract child fields + field_data = self._extract_item(element, self.schema['fields']) + item.update(field_data) + + results.append(item) + + return results + + def _css_to_xpath(self, css_selector: str) -> str: + """Convert CSS selector to XPath if needed""" + if '/' in css_selector: # Already an XPath + return css_selector + else: + # Fallback to basic conversion for common cases + return self._basic_css_to_xpath(css_selector) + + def _basic_css_to_xpath(self, css_selector: str) -> str: + """Basic CSS to XPath conversion for common cases""" + # Handle basic cases + if ' > ' in css_selector: + parts = css_selector.split(' > ') + return '//' + '/'.join(parts) + if ' ' in css_selector: + parts = css_selector.split(' ') + return '//' + '//'.join(parts) + return '//' + css_selector + + def _extract_field(self, element, field): + try: + if field['type'] == 'nested': + xpath = self._css_to_xpath(field['selector']) + nested_element = element.xpath(xpath)[0] if element.xpath(xpath) else None + return self._extract_item(nested_element, field['fields']) if nested_element 
is not None else {} + + if field['type'] == 'list': + xpath = self._css_to_xpath(field['selector']) + elements = element.xpath(xpath) + return [self._extract_list_item(el, field['fields']) for el in elements] + + if field['type'] == 'nested_list': + xpath = self._css_to_xpath(field['selector']) + elements = element.xpath(xpath) + return [self._extract_item(el, field['fields']) for el in elements] + + return self._extract_single_field(element, field) + except Exception as e: + if self.verbose: + print(f"Error extracting field {field['name']}: {str(e)}") + return field.get('default') + + def _extract_list_item(self, element, fields): + item = {} + for field in fields: + value = self._extract_single_field(element, field) + if value is not None: + item[field['name']] = value + return item + + def _extract_single_field(self, element, field): + if 'selector' in field: + xpath = self._css_to_xpath(field['selector']) + selected = element.xpath(xpath) + if not selected: + return field.get('default') + selected = selected[0] + else: + selected = element + + value = None + if field['type'] == 'text': + value = ''.join(selected.xpath('.//text()')).strip() + elif field['type'] == 'attribute': + value = selected.get(field['attribute']) + elif field['type'] == 'html': + value = etree.tostring(selected, encoding='unicode') + elif field['type'] == 'regex': + text = ''.join(selected.xpath('.//text()')).strip() + match = re.search(field['pattern'], text) + value = match.group(1) if match else None + + if 'transform' in field: + value = self._apply_transform(value, field['transform']) + + return value if value is not None else field.get('default') + + def _extract_item(self, element, fields): + item = {} + for field in fields: + if field['type'] == 'computed': + value = self._compute_field(item, field) + else: + value = self._extract_field(element, field) + if value is not None: + item[field['name']] = value + return item + + def _apply_transform(self, value, transform): + if transform == 'lowercase': + return value.lower() + elif transform == 'uppercase': + return value.upper() + elif transform == 'strip': + return value.strip() + return value + + def _compute_field(self, item, field): + try: + if 'expression' in field: + return eval(field['expression'], {}, item) + elif 'function' in field: + return field['function'](item) + except Exception as e: + if self.verbose: + print(f"Error computing field {field['name']}: {str(e)}") + return field.get('default') + + def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: + combined_html = self.DEL.join(sections) + return self.extract(url, combined_html, **kwargs) diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py new file mode 100644 index 0000000000000000000000000000000000000000..3e688f13b660ce42d4300a726b8fd31fde573d7c --- /dev/null +++ b/crawl4ai/extraction_strategy.py @@ -0,0 +1,1052 @@ +from abc import ABC, abstractmethod +from typing import Any, List, Dict, Optional, Union +from concurrent.futures import ThreadPoolExecutor, as_completed +import json, time +# from optimum.intel import IPEXModel +from .prompts import * +from .config import * +from .utils import * +from .models import * +from functools import partial +from .model_loader import * +import math +import numpy as np +import re +from bs4 import BeautifulSoup +from lxml import html, etree +from dataclasses import dataclass + +class ExtractionStrategy(ABC): + """ + Abstract base class for all extraction strategies. 
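A minimal sketch of the contract that the concrete strategies implement (class name and content are invented): a strategy only has to provide extract(); the default run() defined just below fans sections out to extract() in a thread pool.

    # Minimal custom strategy built on the extract()/run() interface defined below.
    from typing import Any, Dict, List
    from crawl4ai.extraction_strategy import ExtractionStrategy

    class UpperCaseStrategy(ExtractionStrategy):
        def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
            # Wrap each section in a single block dict.
            return [{"index": 0, "tags": [], "content": html.upper()}]

    strategy = UpperCaseStrategy(input_format="markdown")
    blocks = strategy.run("https://example.com", ["first section", "second section"])
    # run() submits each section to extract() via a ThreadPoolExecutor.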
+ """ + + def __init__(self, input_format: str = "markdown", **kwargs): + """ + Initialize the extraction strategy. + + Args: + input_format: Content format to use for extraction. + Options: "markdown" (default), "html", "fit_markdown" + **kwargs: Additional keyword arguments + """ + self.input_format = input_format + self.DEL = "<|DEL|>" + self.name = self.__class__.__name__ + self.verbose = kwargs.get("verbose", False) + + @abstractmethod + def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]: + """ + Extract meaningful blocks or chunks from the given HTML. + + :param url: The URL of the webpage. + :param html: The HTML content of the webpage. + :return: A list of extracted blocks or chunks. + """ + pass + + def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: + """ + Process sections of text in parallel by default. + + :param url: The URL of the webpage. + :param sections: List of sections (strings) to process. + :return: A list of processed JSON blocks. + """ + extracted_content = [] + with ThreadPoolExecutor() as executor: + futures = [executor.submit(self.extract, url, section, **kwargs) for section in sections] + for future in as_completed(futures): + extracted_content.extend(future.result()) + return extracted_content + +class NoExtractionStrategy(ExtractionStrategy): + """ + A strategy that does not extract any meaningful content from the HTML. It simply returns the entire HTML as a single block. + """ + def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]: + """ + Extract meaningful blocks or chunks from the given HTML. + """ + return [{"index": 0, "content": html}] + + def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: + return [{"index": i, "tags": [], "content": section} for i, section in enumerate(sections)] + +####################################################### +# Strategies using LLM-based extraction for text data # +####################################################### +class LLMExtractionStrategy(ExtractionStrategy): + """ + A strategy that uses an LLM to extract meaningful content from the HTML. + + Attributes: + provider: The provider to use for extraction. It follows the format /, e.g., "ollama/llama3.3". + api_token: The API token for the provider. + instruction: The instruction to use for the LLM model. + schema: Pydantic model schema for structured data. + extraction_type: "block" or "schema". + chunk_token_threshold: Maximum tokens per chunk. + overlap_rate: Overlap between chunks. + word_token_rate: Word to token conversion rate. + apply_chunking: Whether to apply chunking. + base_url: The base URL for the API request. + api_base: The base URL for the API request. + extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc. + verbose: Whether to print verbose output. + usages: List of individual token usages. + total_usage: Accumulated token usage. + """ + + def __init__(self, + provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None, + instruction:str = None, schema:Dict = None, extraction_type = "block", **kwargs): + """ + Initialize the strategy with clustering parameters. + + Args: + provider: The provider to use for extraction. It follows the format /, e.g., "ollama/llama3.3". + api_token: The API token for the provider. + instruction: The instruction to use for the LLM model. + schema: Pydantic model schema for structured data. + extraction_type: "block" or "schema". 
+ chunk_token_threshold: Maximum tokens per chunk. + overlap_rate: Overlap between chunks. + word_token_rate: Word to token conversion rate. + apply_chunking: Whether to apply chunking. + base_url: The base URL for the API request. + api_base: The base URL for the API request. + extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc. + verbose: Whether to print verbose output. + usages: List of individual token usages. + total_usage: Accumulated token usage. + + """ + super().__init__(**kwargs) + self.provider = provider + self.api_token = api_token or PROVIDER_MODELS.get(provider, "no-token") or os.getenv("OPENAI_API_KEY") + self.instruction = instruction + self.extract_type = extraction_type + self.schema = schema + if schema: + self.extract_type = "schema" + + self.chunk_token_threshold = kwargs.get("chunk_token_threshold", CHUNK_TOKEN_THRESHOLD) + self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE) + self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE) + self.apply_chunking = kwargs.get("apply_chunking", True) + self.base_url = kwargs.get("base_url", None) + self.api_base = kwargs.get("api_base", kwargs.get("base_url", None)) + self.extra_args = kwargs.get("extra_args", {}) + if not self.apply_chunking: + self.chunk_token_threshold = 1e9 + + self.verbose = kwargs.get("verbose", False) + self.usages = [] # Store individual usages + self.total_usage = TokenUsage() # Accumulated usage + + if not self.api_token: + raise ValueError("API token must be provided for LLMExtractionStrategy. Update the config.py or set OPENAI_API_KEY environment variable.") + + + def extract(self, url: str, ix:int, html: str) -> List[Dict[str, Any]]: + """ + Extract meaningful blocks or chunks from the given HTML using an LLM. + + How it works: + 1. Construct a prompt with variables. + 2. Make a request to the LLM using the prompt. + 3. Parse the response and extract blocks or chunks. + + Args: + url: The URL of the webpage. + ix: Index of the block. + html: The HTML content of the webpage. + + Returns: + A list of extracted blocks or chunks. 
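An illustrative usage sketch for LLM-based block extraction; the provider string, model name, and environment variable are placeholders, and any provider in the "<provider>/<model>" form documented above should work the same way.

    # Illustrative only: block extraction with an LLM provider.
    import os
    from crawl4ai.extraction_strategy import LLMExtractionStrategy

    page_markdown = "# Widget 3000\nThe Widget 3000 costs $9.99, ships worldwide, and has a 2-year warranty."

    strategy = LLMExtractionStrategy(
        provider="openai/gpt-4o-mini",            # placeholder model choice
        api_token=os.getenv("OPENAI_API_KEY"),    # placeholder credential source
        instruction="Extract the key product facts as short blocks.",
        chunk_token_threshold=2000,               # sections are merged up to roughly this size
        overlap_rate=0.1,
        verbose=True,
    )

    blocks = strategy.run("https://example.com/product", [page_markdown])
    strategy.show_usage()   # prints the accumulated token usage tables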
+ """ + if self.verbose: + # print("[LOG] Extracting blocks from URL:", url) + print(f"[LOG] Call LLM for {url} - block index: {ix}") + + variable_values = { + "URL": url, + "HTML": escape_json_string(sanitize_html(html)), + } + + prompt_with_variables = PROMPT_EXTRACT_BLOCKS + if self.instruction: + variable_values["REQUEST"] = self.instruction + prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION + + if self.extract_type == "schema" and self.schema: + variable_values["SCHEMA"] = json.dumps(self.schema, indent=2) + prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION + + for variable in variable_values: + prompt_with_variables = prompt_with_variables.replace( + "{" + variable + "}", variable_values[variable] + ) + + response = perform_completion_with_backoff( + self.provider, + prompt_with_variables, + self.api_token, + base_url=self.api_base or self.base_url, + extra_args = self.extra_args + ) # , json_response=self.extract_type == "schema") + # Track usage + usage = TokenUsage( + completion_tokens=response.usage.completion_tokens, + prompt_tokens=response.usage.prompt_tokens, + total_tokens=response.usage.total_tokens, + completion_tokens_details=response.usage.completion_tokens_details.__dict__ if response.usage.completion_tokens_details else {}, + prompt_tokens_details=response.usage.prompt_tokens_details.__dict__ if response.usage.prompt_tokens_details else {} + ) + self.usages.append(usage) + + # Update totals + self.total_usage.completion_tokens += usage.completion_tokens + self.total_usage.prompt_tokens += usage.prompt_tokens + self.total_usage.total_tokens += usage.total_tokens + + try: + blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks'] + blocks = json.loads(blocks) + for block in blocks: + block['error'] = False + except Exception as e: + parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content) + blocks = parsed + if unparsed: + blocks.append({ + "index": 0, + "error": True, + "tags": ["error"], + "content": unparsed + }) + + if self.verbose: + print("[LOG] Extracted", len(blocks), "blocks from URL:", url, "block index:", ix) + return blocks + + def _merge(self, documents, chunk_token_threshold, overlap): + """ + Merge documents into sections based on chunk_token_threshold and overlap. 
+ """ + chunks = [] + sections = [] + total_tokens = 0 + + # Calculate the total tokens across all documents + for document in documents: + total_tokens += len(document.split(' ')) * self.word_token_rate + + # Calculate the number of sections needed + num_sections = math.floor(total_tokens / chunk_token_threshold) + if num_sections < 1: + num_sections = 1 # Ensure there is at least one section + adjusted_chunk_threshold = total_tokens / num_sections + + total_token_so_far = 0 + current_chunk = [] + + for document in documents: + tokens = document.split(' ') + token_count = len(tokens) * self.word_token_rate + + if total_token_so_far + token_count <= adjusted_chunk_threshold: + current_chunk.extend(tokens) + total_token_so_far += token_count + else: + # Ensure to handle the last section properly + if len(sections) == num_sections - 1: + current_chunk.extend(tokens) + continue + + # Add overlap if specified + if overlap > 0 and current_chunk: + overlap_tokens = current_chunk[-overlap:] + current_chunk.extend(overlap_tokens) + + sections.append(' '.join(current_chunk)) + current_chunk = tokens + total_token_so_far = token_count + + # Add the last chunk + if current_chunk: + sections.append(' '.join(current_chunk)) + + return sections + + + def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]: + """ + Process sections sequentially with a delay for rate limiting issues, specifically for LLMExtractionStrategy. + + Args: + url: The URL of the webpage. + sections: List of sections (strings) to process. + + Returns: + A list of extracted blocks or chunks. + """ + + merged_sections = self._merge( + sections, self.chunk_token_threshold, + overlap= int(self.chunk_token_threshold * self.overlap_rate) + ) + extracted_content = [] + if self.provider.startswith("groq/"): + # Sequential processing with a delay + for ix, section in enumerate(merged_sections): + extract_func = partial(self.extract, url) + extracted_content.extend(extract_func(ix, sanitize_input_encode(section))) + time.sleep(0.5) # 500 ms delay between each processing + else: + # Parallel processing using ThreadPoolExecutor + # extract_func = partial(self.extract, url) + # for ix, section in enumerate(merged_sections): + # extracted_content.append(extract_func(ix, section)) + + with ThreadPoolExecutor(max_workers=4) as executor: + extract_func = partial(self.extract, url) + futures = [executor.submit(extract_func, ix, sanitize_input_encode(section)) for ix, section in enumerate(merged_sections)] + + for future in as_completed(futures): + try: + extracted_content.extend(future.result()) + except Exception as e: + if self.verbose: + print(f"Error in thread execution: {e}") + # Add error information to extracted_content + extracted_content.append({ + "index": 0, + "error": True, + "tags": ["error"], + "content": str(e) + }) + + + return extracted_content + + + def show_usage(self) -> None: + """Print a detailed token usage report showing total and per-request usage.""" + print("\n=== Token Usage Summary ===") + print(f"{'Type':<15} {'Count':>12}") + print("-" * 30) + print(f"{'Completion':<15} {self.total_usage.completion_tokens:>12,}") + print(f"{'Prompt':<15} {self.total_usage.prompt_tokens:>12,}") + print(f"{'Total':<15} {self.total_usage.total_tokens:>12,}") + + print("\n=== Usage History ===") + print(f"{'Request #':<10} {'Completion':>12} {'Prompt':>12} {'Total':>12}") + print("-" * 48) + for i, usage in enumerate(self.usages, 1): + print(f"{i:<10} {usage.completion_tokens:>12,} {usage.prompt_tokens:>12,} 
{usage.total_tokens:>12,}") + +####################################################### +# Strategies using clustering for text data extraction # +####################################################### + +class CosineStrategy(ExtractionStrategy): + """ + Extract meaningful blocks or chunks from the given HTML using cosine similarity. + + How it works: + 1. Pre-filter documents using embeddings and semantic_filter. + 2. Perform clustering using cosine similarity. + 3. Organize texts by their cluster labels, retaining order. + 4. Filter clusters by word count. + 5. Extract meaningful blocks or chunks from the filtered clusters. + + Attributes: + semantic_filter (str): A keyword filter for document filtering. + word_count_threshold (int): Minimum number of words per cluster. + max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters. + linkage_method (str): The linkage method for hierarchical clustering. + top_k (int): Number of top categories to extract. + model_name (str): The name of the sentence-transformers model. + sim_threshold (float): The similarity threshold for clustering. + """ + def __init__(self, semantic_filter = None, word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name = 'sentence-transformers/all-MiniLM-L6-v2', sim_threshold = 0.3, **kwargs): + """ + Initialize the strategy with clustering parameters. + + Args: + semantic_filter (str): A keyword filter for document filtering. + word_count_threshold (int): Minimum number of words per cluster. + max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters. + linkage_method (str): The linkage method for hierarchical clustering. + top_k (int): Number of top categories to extract. + """ + super().__init__(**kwargs) + + import numpy as np + + self.semantic_filter = semantic_filter + self.word_count_threshold = word_count_threshold + self.max_dist = max_dist + self.linkage_method = linkage_method + self.top_k = top_k + self.sim_threshold = sim_threshold + self.timer = time.time() + self.verbose = kwargs.get("verbose", False) + + self.buffer_embeddings = np.array([]) + self.get_embedding_method = "direct" + + self.device = get_device() + # import torch + # self.device = torch.device('cpu') + + self.default_batch_size = calculate_batch_size(self.device) + + if self.verbose: + print(f"[LOG] Loading Extraction Model for {self.device.type} device.") + + # if False and self.device.type == "cpu": + # self.model = load_onnx_all_MiniLM_l6_v2() + # self.tokenizer = self.model.tokenizer + # self.get_embedding_method = "direct" + # else: + + self.tokenizer, self.model = load_HF_embedding_model(model_name) + self.model.to(self.device) + self.model.eval() + + self.get_embedding_method = "batch" + + self.buffer_embeddings = np.array([]) + + # if model_name == "bert-base-uncased": + # self.tokenizer, self.model = load_bert_base_uncased() + # self.model.eval() # Ensure the model is in evaluation mode + # self.get_embedding_method = "batch" + # elif model_name == "BAAI/bge-small-en-v1.5": + # self.tokenizer, self.model = load_bge_small_en_v1_5() + # self.model.eval() # Ensure the model is in evaluation mode + # self.get_embedding_method = "batch" + # elif model_name == "sentence-transformers/all-MiniLM-L6-v2": + # self.model = load_onnx_all_MiniLM_l6_v2() + # self.tokenizer = self.model.tokenizer + # self.get_embedding_method = "direct" + + + if self.verbose: + print(f"[LOG] Loading Multilabel Classifier for {self.device.type} device.") + + self.nlp, _ = 
load_text_multilabel_classifier() + # self.default_batch_size = 16 if self.device.type == 'cpu' else 64 + + if self.verbose: + print(f"[LOG] Model loaded {model_name}, models/reuters, took " + str(time.time() - self.timer) + " seconds") + + def filter_documents_embeddings(self, documents: List[str], semantic_filter: str, at_least_k: int = 20) -> List[str]: + """ + Filter and sort documents based on the cosine similarity of their embeddings with the semantic_filter embedding. + + Args: + documents (List[str]): A list of document texts. + semantic_filter (str): A keyword filter for document filtering. + at_least_k (int): The minimum number of documents to return. + + Returns: + List[str]: A list of filtered and sorted document texts. + """ + + if not semantic_filter: + return documents + + if len(documents) < at_least_k: + at_least_k = len(documents) // 2 + + from sklearn.metrics.pairwise import cosine_similarity + + # Compute embedding for the keyword filter + query_embedding = self.get_embeddings([semantic_filter])[0] + + # Compute embeddings for the documents + document_embeddings = self.get_embeddings(documents) + + # Calculate cosine similarity between the query embedding and document embeddings + similarities = cosine_similarity([query_embedding], document_embeddings).flatten() + + # Filter documents based on the similarity threshold + filtered_docs = [(doc, sim) for doc, sim in zip(documents, similarities) if sim >= self.sim_threshold] + + # If the number of filtered documents is less than at_least_k, sort remaining documents by similarity + if len(filtered_docs) < at_least_k: + remaining_docs = [(doc, sim) for doc, sim in zip(documents, similarities) if sim < self.sim_threshold] + remaining_docs.sort(key=lambda x: x[1], reverse=True) + filtered_docs.extend(remaining_docs[:at_least_k - len(filtered_docs)]) + + # Extract the document texts from the tuples + filtered_docs = [doc for doc, _ in filtered_docs] + + return filtered_docs[:at_least_k] + + def get_embeddings(self, sentences: List[str], batch_size=None, bypass_buffer=False): + """ + Get BERT embeddings for a list of sentences. + + Args: + sentences (List[str]): A list of text chunks (sentences). + + Returns: + NumPy array of embeddings. 
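For reference, a standalone sketch of the batched mean pooling performed here (it assumes the torch/transformers extras are installed and uses the same default model name as CosineStrategy):

    # Standalone mean-pooling embedding sketch mirroring get_embeddings().
    import torch
    from transformers import AutoModel, AutoTokenizer

    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).eval()

    sentences = ["first chunk of text", "second chunk of text"]
    encoded = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        output = model(**encoded)

    # Mean of the last hidden state over the token dimension, one row per sentence.
    embeddings = output.last_hidden_state.mean(dim=1).numpy()
    print(embeddings.shape)   # (2, 384) for this model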
+ """ + # if self.buffer_embeddings.any() and not bypass_buffer: + # return self.buffer_embeddings + + if self.device.type in [ "cpu", "gpu", "cuda", "mps"]: + import torch + # Tokenize sentences and convert to tensor + if batch_size is None: + batch_size = self.default_batch_size + + all_embeddings = [] + for i in range(0, len(sentences), batch_size): + batch_sentences = sentences[i:i + batch_size] + encoded_input = self.tokenizer(batch_sentences, padding=True, truncation=True, return_tensors='pt') + encoded_input = {key: tensor.to(self.device) for key, tensor in encoded_input.items()} + + # Ensure no gradients are calculated + with torch.no_grad(): + model_output = self.model(**encoded_input) + + # Get embeddings from the last hidden state (mean pooling) + embeddings = model_output.last_hidden_state.mean(dim=1).cpu().numpy() + all_embeddings.append(embeddings) + + self.buffer_embeddings = np.vstack(all_embeddings) + elif self.device.type == "cpu": + # self.buffer_embeddings = self.model(sentences) + if batch_size is None: + batch_size = self.default_batch_size + + all_embeddings = [] + for i in range(0, len(sentences), batch_size): + batch_sentences = sentences[i:i + batch_size] + embeddings = self.model(batch_sentences) + all_embeddings.append(embeddings) + + self.buffer_embeddings = np.vstack(all_embeddings) + return self.buffer_embeddings + + def hierarchical_clustering(self, sentences: List[str], embeddings = None): + """ + Perform hierarchical clustering on sentences and return cluster labels. + + Args: + sentences (List[str]): A list of text chunks (sentences). + + Returns: + NumPy array of cluster labels. + """ + # Get embeddings + from scipy.cluster.hierarchy import linkage, fcluster + from scipy.spatial.distance import pdist + self.timer = time.time() + embeddings = self.get_embeddings(sentences, bypass_buffer=True) + # print(f"[LOG] 🚀 Embeddings computed in {time.time() - self.timer:.2f} seconds") + # Compute pairwise cosine distances + distance_matrix = pdist(embeddings, 'cosine') + # Perform agglomerative clustering respecting order + linked = linkage(distance_matrix, method=self.linkage_method) + # Form flat clusters + labels = fcluster(linked, self.max_dist, criterion='distance') + return labels + + def filter_clusters_by_word_count(self, clusters: Dict[int, List[str]]) -> Dict[int, List[str]]: + """ + Filter clusters to remove those with a word count below the threshold. + + Args: + clusters (Dict[int, List[str]]): Dictionary of clusters. + + Returns: + Dict[int, List[str]]: Filtered dictionary of clusters. + """ + filtered_clusters = {} + for cluster_id, texts in clusters.items(): + # Concatenate texts for analysis + full_text = " ".join(texts) + # Count words + word_count = len(full_text.split()) + + # Keep clusters with word count above the threshold + if word_count >= self.word_count_threshold: + filtered_clusters[cluster_id] = texts + + return filtered_clusters + + def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]: + """ + Extract clusters from HTML content using hierarchical clustering. + + Args: + url (str): The URL of the webpage. + html (str): The HTML content of the webpage. + + Returns: + List[Dict[str, Any]]: A list of processed JSON blocks. 
+ """ + # Assume `html` is a list of text chunks for this strategy + t = time.time() + text_chunks = html.split(self.DEL) # Split by lines or paragraphs as needed + + # Pre-filter documents using embeddings and semantic_filter + text_chunks = self.filter_documents_embeddings(text_chunks, self.semantic_filter) + + if not text_chunks: + return [] + + # Perform clustering + labels = self.hierarchical_clustering(text_chunks) + # print(f"[LOG] 🚀 Clustering done in {time.time() - t:.2f} seconds") + + # Organize texts by their cluster labels, retaining order + t = time.time() + clusters = {} + for index, label in enumerate(labels): + clusters.setdefault(label, []).append(text_chunks[index]) + + # Filter clusters by word count + filtered_clusters = self.filter_clusters_by_word_count(clusters) + + # Convert filtered clusters to a sorted list of dictionaries + cluster_list = [{"index": int(idx), "tags" : [], "content": " ".join(filtered_clusters[idx])} for idx in sorted(filtered_clusters)] + + if self.verbose: + print(f"[LOG] 🚀 Assign tags using {self.device}") + + if self.device.type in ["gpu", "cuda", "mps", "cpu"]: + labels = self.nlp([cluster['content'] for cluster in cluster_list]) + + for cluster, label in zip(cluster_list, labels): + cluster['tags'] = label + # elif self.device.type == "cpu": + # # Process the text with the loaded model + # texts = [cluster['content'] for cluster in cluster_list] + # # Batch process texts + # docs = self.nlp.pipe(texts, disable=["tagger", "parser", "ner", "lemmatizer"]) + + # for doc, cluster in zip(docs, cluster_list): + # tok_k = self.top_k + # top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k] + # cluster['tags'] = [cat for cat, _ in top_categories] + + # for cluster in cluster_list: + # doc = self.nlp(cluster['content']) + # tok_k = self.top_k + # top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k] + # cluster['tags'] = [cat for cat, _ in top_categories] + + if self.verbose: + print(f"[LOG] 🚀 Categorization done in {time.time() - t:.2f} seconds") + + return cluster_list + + def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: + """ + Process sections using hierarchical clustering. + + Args: + url (str): The URL of the webpage. + sections (List[str]): List of sections (strings) to process. + + Returns: + """ + # This strategy processes all sections together + + return self.extract(url, self.DEL.join(sections), **kwargs) + +####################################################### +# New extraction strategies for JSON-based extraction # +####################################################### + +class JsonElementExtractionStrategy(ExtractionStrategy): + """ + Abstract base class for extracting structured JSON from HTML content. + + How it works: + 1. Parses HTML content using the `_parse_html` method. + 2. Uses a schema to define base selectors, fields, and transformations. + 3. Extracts data hierarchically, supporting nested fields and lists. + 4. Handles computed fields with expressions or functions. + + Attributes: + DEL (str): Delimiter used to combine HTML sections. Defaults to '\n'. + schema (Dict[str, Any]): The schema defining the extraction rules. + verbose (bool): Enables verbose logging for debugging purposes. + + Methods: + extract(url, html_content, *q, **kwargs): Extracts structured data from HTML content. + _extract_item(element, fields): Extracts fields from a single element. 
+ _extract_single_field(element, field): Extracts a single field based on its type. + _apply_transform(value, transform): Applies a transformation to a value. + _compute_field(item, field): Computes a field value using an expression or function. + run(url, sections, *q, **kwargs): Combines HTML sections and runs the extraction strategy. + + Abstract Methods: + _parse_html(html_content): Parses raw HTML into a structured format (e.g., BeautifulSoup or lxml). + _get_base_elements(parsed_html, selector): Retrieves base elements using a selector. + _get_elements(element, selector): Retrieves child elements using a selector. + _get_element_text(element): Extracts text content from an element. + _get_element_html(element): Extracts raw HTML from an element. + _get_element_attribute(element, attribute): Extracts an attribute's value from an element. + """ + + + DEL = '\n' + + def __init__(self, schema: Dict[str, Any], **kwargs): + """ + Initialize the JSON element extraction strategy with a schema. + + Args: + schema (Dict[str, Any]): The schema defining the extraction rules. + """ + super().__init__(**kwargs) + self.schema = schema + self.verbose = kwargs.get('verbose', False) + + def extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]: + """ + Extract structured data from HTML content. + + How it works: + 1. Parses the HTML content using the `_parse_html` method. + 2. Identifies base elements using the schema's base selector. + 3. Extracts fields from each base element using `_extract_item`. + + Args: + url (str): The URL of the page being processed. + html_content (str): The raw HTML content to parse and extract. + *q: Additional positional arguments. + **kwargs: Additional keyword arguments for custom extraction. + + Returns: + List[Dict[str, Any]]: A list of extracted items, each represented as a dictionary. 
+ """ + + parsed_html = self._parse_html(html_content) + base_elements = self._get_base_elements(parsed_html, self.schema['baseSelector']) + + results = [] + for element in base_elements: + # Extract base element attributes + item = {} + if 'baseFields' in self.schema: + for field in self.schema['baseFields']: + value = self._extract_single_field(element, field) + if value is not None: + item[field['name']] = value + + # Extract child fields + field_data = self._extract_item(element, self.schema['fields']) + item.update(field_data) + + if item: + results.append(item) + + return results + + @abstractmethod + def _parse_html(self, html_content: str): + """Parse HTML content into appropriate format""" + pass + + @abstractmethod + def _get_base_elements(self, parsed_html, selector: str): + """Get all base elements using the selector""" + pass + + @abstractmethod + def _get_elements(self, element, selector: str): + """Get child elements using the selector""" + pass + + def _extract_field(self, element, field): + try: + if field['type'] == 'nested': + nested_elements = self._get_elements(element, field['selector']) + nested_element = nested_elements[0] if nested_elements else None + return self._extract_item(nested_element, field['fields']) if nested_element else {} + + if field['type'] == 'list': + elements = self._get_elements(element, field['selector']) + return [self._extract_list_item(el, field['fields']) for el in elements] + + if field['type'] == 'nested_list': + elements = self._get_elements(element, field['selector']) + return [self._extract_item(el, field['fields']) for el in elements] + + return self._extract_single_field(element, field) + except Exception as e: + if self.verbose: + print(f"Error extracting field {field['name']}: {str(e)}") + return field.get('default') + + def _extract_single_field(self, element, field): + """ + Extract a single field based on its type. + + How it works: + 1. Selects the target element using the field's selector. + 2. Extracts the field value based on its type (e.g., text, attribute, regex). + 3. Applies transformations if defined in the schema. + + Args: + element: The base element to extract the field from. + field (Dict[str, Any]): The field definition in the schema. + + Returns: + Any: The extracted field value. + """ + + if 'selector' in field: + selected = self._get_elements(element, field['selector']) + if not selected: + return field.get('default') + selected = selected[0] + else: + selected = element + + value = None + if field['type'] == 'text': + value = self._get_element_text(selected) + elif field['type'] == 'attribute': + value = self._get_element_attribute(selected, field['attribute']) + elif field['type'] == 'html': + value = self._get_element_html(selected) + elif field['type'] == 'regex': + text = self._get_element_text(selected) + match = re.search(field['pattern'], text) + value = match.group(1) if match else None + + if 'transform' in field: + value = self._apply_transform(value, field['transform']) + + return value if value is not None else field.get('default') + + def _extract_list_item(self, element, fields): + item = {} + for field in fields: + value = self._extract_single_field(element, field) + if value is not None: + item[field['name']] = value + return item + + def _extract_item(self, element, fields): + """ + Extracts fields from a given element. + + How it works: + 1. Iterates through the fields defined in the schema. + 2. Handles computed, single, and nested field types. + 3. 
Updates the item dictionary with extracted field values. + + Args: + element: The base element to extract fields from. + fields (List[Dict[str, Any]]): The list of fields to extract. + + Returns: + Dict[str, Any]: A dictionary representing the extracted item. + """ + + item = {} + for field in fields: + if field['type'] == 'computed': + value = self._compute_field(item, field) + else: + value = self._extract_field(element, field) + if value is not None: + item[field['name']] = value + return item + + def _apply_transform(self, value, transform): + """ + Apply a transformation to a value. + + How it works: + 1. Checks the transformation type (e.g., `lowercase`, `strip`). + 2. Applies the transformation to the value. + 3. Returns the transformed value. + + Args: + value (str): The value to transform. + transform (str): The type of transformation to apply. + + Returns: + str: The transformed value. + """ + + if transform == 'lowercase': + return value.lower() + elif transform == 'uppercase': + return value.upper() + elif transform == 'strip': + return value.strip() + return value + + def _compute_field(self, item, field): + try: + if 'expression' in field: + return eval(field['expression'], {}, item) + elif 'function' in field: + return field['function'](item) + except Exception as e: + if self.verbose: + print(f"Error computing field {field['name']}: {str(e)}") + return field.get('default') + + def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: + """ + Run the extraction strategy on a combined HTML content. + + How it works: + 1. Combines multiple HTML sections using the `DEL` delimiter. + 2. Calls the `extract` method with the combined HTML. + + Args: + url (str): The URL of the page being processed. + sections (List[str]): A list of HTML sections. + *q: Additional positional arguments. + **kwargs: Additional keyword arguments for custom extraction. + + Returns: + List[Dict[str, Any]]: A list of extracted items. + """ + + combined_html = self.DEL.join(sections) + return self.extract(url, combined_html, **kwargs) + + @abstractmethod + def _get_element_text(self, element) -> str: + """Get text content from element""" + pass + + @abstractmethod + def _get_element_html(self, element) -> str: + """Get HTML content from element""" + pass + + @abstractmethod + def _get_element_attribute(self, element, attribute: str): + """Get attribute value from element""" + pass + +class JsonCssExtractionStrategy(JsonElementExtractionStrategy): + """ + Concrete implementation of `JsonElementExtractionStrategy` using CSS selectors. + + How it works: + 1. Parses HTML content with BeautifulSoup. + 2. Selects elements using CSS selectors defined in the schema. + 3. Extracts field data and applies transformations as defined. + + Attributes: + schema (Dict[str, Any]): The schema defining the extraction rules. + verbose (bool): Enables verbose logging for debugging purposes. + + Methods: + _parse_html(html_content): Parses HTML content into a BeautifulSoup object. + _get_base_elements(parsed_html, selector): Selects base elements using a CSS selector. + _get_elements(element, selector): Selects child elements using a CSS selector. + _get_element_text(element): Extracts text content from a BeautifulSoup element. + _get_element_html(element): Extracts the raw HTML content of a BeautifulSoup element. + _get_element_attribute(element, attribute): Retrieves an attribute value from a BeautifulSoup element. 
+ """ + + def __init__(self, schema: Dict[str, Any], **kwargs): + kwargs['input_format'] = 'html' # Force HTML input + super().__init__(schema, **kwargs) + + def _parse_html(self, html_content: str): + return BeautifulSoup(html_content, 'html.parser') + + def _get_base_elements(self, parsed_html, selector: str): + return parsed_html.select(selector) + + def _get_elements(self, element, selector: str): + # Return all matching elements using select() instead of select_one() + # This ensures that we get all elements that match the selector, not just the first one + return element.select(selector) + + def _get_element_text(self, element) -> str: + return element.get_text(strip=True) + + def _get_element_html(self, element) -> str: + return str(element) + + def _get_element_attribute(self, element, attribute: str): + return element.get(attribute) + +class JsonXPathExtractionStrategy(JsonElementExtractionStrategy): + """ + Concrete implementation of `JsonElementExtractionStrategy` using XPath selectors. + + How it works: + 1. Parses HTML content into an lxml tree. + 2. Selects elements using XPath expressions. + 3. Converts CSS selectors to XPath when needed. + + Attributes: + schema (Dict[str, Any]): The schema defining the extraction rules. + verbose (bool): Enables verbose logging for debugging purposes. + + Methods: + _parse_html(html_content): Parses HTML content into an lxml tree. + _get_base_elements(parsed_html, selector): Selects base elements using an XPath selector. + _css_to_xpath(css_selector): Converts a CSS selector to an XPath expression. + _get_elements(element, selector): Selects child elements using an XPath selector. + _get_element_text(element): Extracts text content from an lxml element. + _get_element_html(element): Extracts the raw HTML content of an lxml element. + _get_element_attribute(element, attribute): Retrieves an attribute value from an lxml element. + """ + + def __init__(self, schema: Dict[str, Any], **kwargs): + kwargs['input_format'] = 'html' # Force HTML input + super().__init__(schema, **kwargs) + + def _parse_html(self, html_content: str): + return html.fromstring(html_content) + + def _get_base_elements(self, parsed_html, selector: str): + return parsed_html.xpath(selector) + + def _css_to_xpath(self, css_selector: str) -> str: + """Convert CSS selector to XPath if needed""" + if '/' in css_selector: # Already an XPath + return css_selector + return self._basic_css_to_xpath(css_selector) + + def _basic_css_to_xpath(self, css_selector: str) -> str: + """Basic CSS to XPath conversion for common cases""" + if ' > ' in css_selector: + parts = css_selector.split(' > ') + return '//' + '/'.join(parts) + if ' ' in css_selector: + parts = css_selector.split(' ') + return '//' + '//'.join(parts) + return '//' + css_selector + + def _get_elements(self, element, selector: str): + xpath = self._css_to_xpath(selector) + if not xpath.startswith('.'): + xpath = '.' 
+ xpath + return element.xpath(xpath) + + def _get_element_text(self, element) -> str: + return ''.join(element.xpath('.//text()')).strip() + + def _get_element_html(self, element) -> str: + return etree.tostring(element, encoding='unicode') + + def _get_element_attribute(self, element, attribute: str): + return element.get(attribute) diff --git a/crawl4ai/html2text/__init__.py b/crawl4ai/html2text/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c41258e0baf2d8a668c2438cda8a14a58c241bd4 --- /dev/null +++ b/crawl4ai/html2text/__init__.py @@ -0,0 +1,1141 @@ +"""html2text: Turn HTML into equivalent Markdown-structured text.""" + +import html.entities +import html.parser +import re +import string +import urllib.parse as urlparse +from textwrap import wrap +from typing import Dict, List, Optional, Tuple, Union + +from . import config +from ._typing import OutCallback +from .elements import AnchorElement, ListElement +from .utils import ( + dumb_css_parser, + element_style, + escape_md, + escape_md_section, + google_fixed_width_font, + google_has_height, + google_list_style, + google_text_emphasis, + hn, + list_numbering_start, + pad_tables_in_text, + skipwrap, + unifiable_n, +) + +__version__ = (2024, 2, 26) + + +# TODO: +# Support decoded entities with UNIFIABLE. + + +class HTML2Text(html.parser.HTMLParser): + def __init__( + self, + out: Optional[OutCallback] = None, + baseurl: str = "", + bodywidth: int = config.BODY_WIDTH, + ) -> None: + """ + Input parameters: + out: possible custom replacement for self.outtextf (which + appends lines of text). + baseurl: base URL of the document we process + """ + super().__init__(convert_charrefs=False) + + # Config options + self.split_next_td = False + self.td_count = 0 + self.table_start = False + self.unicode_snob = config.UNICODE_SNOB # covered in cli + + self.escape_snob = config.ESCAPE_SNOB # covered in cli + self.escape_backslash = config.ESCAPE_BACKSLASH # covered in cli + self.escape_dot = config.ESCAPE_DOT # covered in cli + self.escape_plus = config.ESCAPE_PLUS # covered in cli + self.escape_dash = config.ESCAPE_DASH # covered in cli + + self.links_each_paragraph = config.LINKS_EACH_PARAGRAPH + self.body_width = bodywidth # covered in cli + self.skip_internal_links = config.SKIP_INTERNAL_LINKS # covered in cli + self.inline_links = config.INLINE_LINKS # covered in cli + self.protect_links = config.PROTECT_LINKS # covered in cli + self.google_list_indent = config.GOOGLE_LIST_INDENT # covered in cli + self.ignore_links = config.IGNORE_ANCHORS # covered in cli + self.ignore_mailto_links = config.IGNORE_MAILTO_LINKS # covered in cli + self.ignore_images = config.IGNORE_IMAGES # covered in cli + self.images_as_html = config.IMAGES_AS_HTML # covered in cli + self.images_to_alt = config.IMAGES_TO_ALT # covered in cli + self.images_with_size = config.IMAGES_WITH_SIZE # covered in cli + self.ignore_emphasis = config.IGNORE_EMPHASIS # covered in cli + self.bypass_tables = config.BYPASS_TABLES # covered in cli + self.ignore_tables = config.IGNORE_TABLES # covered in cli + self.google_doc = False # covered in cli + self.ul_item_mark = "*" # covered in cli + self.emphasis_mark = "_" # covered in cli + self.strong_mark = "**" + self.single_line_break = config.SINGLE_LINE_BREAK # covered in cli + self.use_automatic_links = config.USE_AUTOMATIC_LINKS # covered in cli + self.hide_strikethrough = False # covered in cli + self.mark_code = config.MARK_CODE + self.wrap_list_items = config.WRAP_LIST_ITEMS # covered in cli + 
self.wrap_links = config.WRAP_LINKS # covered in cli + self.wrap_tables = config.WRAP_TABLES + self.pad_tables = config.PAD_TABLES # covered in cli + self.default_image_alt = config.DEFAULT_IMAGE_ALT # covered in cli + self.tag_callback = None + self.open_quote = config.OPEN_QUOTE # covered in cli + self.close_quote = config.CLOSE_QUOTE # covered in cli + self.include_sup_sub = config.INCLUDE_SUP_SUB # covered in cli + + if out is None: + self.out = self.outtextf + else: + self.out = out + + # empty list to store output characters before they are "joined" + self.outtextlist: List[str] = [] + + self.quiet = 0 + self.p_p = 0 # number of newline character to print before next output + self.outcount = 0 + self.start = True + self.space = False + self.a: List[AnchorElement] = [] + self.astack: List[Optional[Dict[str, Optional[str]]]] = [] + self.maybe_automatic_link: Optional[str] = None + self.empty_link = False + self.absolute_url_matcher = re.compile(r"^[a-zA-Z+]+://") + self.acount = 0 + self.list: List[ListElement] = [] + self.blockquote = 0 + self.pre = False + self.startpre = False + self.code = False + self.quote = False + self.br_toggle = "" + self.lastWasNL = False + self.lastWasList = False + self.style = 0 + self.style_def: Dict[str, Dict[str, str]] = {} + self.tag_stack: List[Tuple[str, Dict[str, Optional[str]], Dict[str, str]]] = [] + self.emphasis = 0 + self.drop_white_space = 0 + self.inheader = False + # Current abbreviation definition + self.abbr_title: Optional[str] = None + # Last inner HTML (for abbr being defined) + self.abbr_data: Optional[str] = None + # Stack of abbreviations to write later + self.abbr_list: Dict[str, str] = {} + self.baseurl = baseurl + self.stressed = False + self.preceding_stressed = False + self.preceding_data = "" + self.current_tag = "" + + config.UNIFIABLE["nbsp"] = " _place_holder;" + + def update_params(self, **kwargs): + for key, value in kwargs.items(): + setattr(self, key, value) + + def feed(self, data: str) -> None: + data = data.replace("", "") + super().feed(data) + + def handle(self, data: str) -> str: + self.start = True + self.feed(data) + self.feed("") + markdown = self.optwrap(self.finish()) + if self.pad_tables: + return pad_tables_in_text(markdown) + else: + return markdown + + def outtextf(self, s: str) -> None: + self.outtextlist.append(s) + if s: + self.lastWasNL = s[-1] == "\n" + + def finish(self) -> str: + self.close() + + self.pbr() + self.o("", force="end") + + outtext = "".join(self.outtextlist) + + if self.unicode_snob: + nbsp = html.entities.html5["nbsp;"] + else: + nbsp = " " + outtext = outtext.replace(" _place_holder;", nbsp) + + # Clear self.outtextlist to avoid memory leak of its content to + # the next handling. + self.outtextlist = [] + + return outtext + + def handle_charref(self, c: str) -> None: + self.handle_data(self.charref(c), True) + + def handle_entityref(self, c: str) -> None: + ref = self.entityref(c) + + # ref may be an empty string (e.g. for ‎/‏ markers that should + # not contribute to the final output). + # self.handle_data cannot handle a zero-length string right after a + # stressed tag or mid-text within a stressed tag (text get split and + # self.stressed/self.preceding_stressed gets switched after the first + # part of that text). 
+ if ref: + self.handle_data(ref, True) + + def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None: + self.handle_tag(tag, dict(attrs), start=True) + + def handle_endtag(self, tag: str) -> None: + self.handle_tag(tag, {}, start=False) + + def previousIndex(self, attrs: Dict[str, Optional[str]]) -> Optional[int]: + """ + :type attrs: dict + + :returns: The index of certain set of attributes (of a link) in the + self.a list. If the set of attributes is not found, returns None + :rtype: int + """ + if "href" not in attrs: + return None + + match = False + for i, a in enumerate(self.a): + if "href" in a.attrs and a.attrs["href"] == attrs["href"]: + if "title" in a.attrs or "title" in attrs: + if ( + "title" in a.attrs + and "title" in attrs + and a.attrs["title"] == attrs["title"] + ): + match = True + else: + match = True + + if match: + return i + return None + + def handle_emphasis( + self, start: bool, tag_style: Dict[str, str], parent_style: Dict[str, str] + ) -> None: + """ + Handles various text emphases + """ + tag_emphasis = google_text_emphasis(tag_style) + parent_emphasis = google_text_emphasis(parent_style) + + # handle Google's text emphasis + strikethrough = "line-through" in tag_emphasis and self.hide_strikethrough + + # google and others may mark a font's weight as `bold` or `700` + bold = False + for bold_marker in config.BOLD_TEXT_STYLE_VALUES: + bold = bold_marker in tag_emphasis and bold_marker not in parent_emphasis + if bold: + break + + italic = "italic" in tag_emphasis and "italic" not in parent_emphasis + fixed = ( + google_fixed_width_font(tag_style) + and not google_fixed_width_font(parent_style) + and not self.pre + ) + + if start: + # crossed-out text must be handled before other attributes + # in order not to output qualifiers unnecessarily + if bold or italic or fixed: + self.emphasis += 1 + if strikethrough: + self.quiet += 1 + if italic: + self.o(self.emphasis_mark) + self.drop_white_space += 1 + if bold: + self.o(self.strong_mark) + self.drop_white_space += 1 + if fixed: + self.o("`") + self.drop_white_space += 1 + self.code = True + else: + if bold or italic or fixed: + # there must not be whitespace before closing emphasis mark + self.emphasis -= 1 + self.space = False + if fixed: + if self.drop_white_space: + # empty emphasis, drop it + self.drop_white_space -= 1 + else: + self.o("`") + self.code = False + if bold: + if self.drop_white_space: + # empty emphasis, drop it + self.drop_white_space -= 1 + else: + self.o(self.strong_mark) + if italic: + if self.drop_white_space: + # empty emphasis, drop it + self.drop_white_space -= 1 + else: + self.o(self.emphasis_mark) + # space is only allowed after *all* emphasis marks + if (bold or italic) and not self.emphasis: + self.o(" ") + if strikethrough: + self.quiet -= 1 + + def handle_tag( + self, tag: str, attrs: Dict[str, Optional[str]], start: bool + ) -> None: + self.current_tag = tag + + if self.tag_callback is not None: + if self.tag_callback(self, tag, attrs, start) is True: + return + + # first thing inside the anchor tag is another tag + # that produces some output + if ( + start + and self.maybe_automatic_link is not None + and tag not in ["p", "div", "style", "dl", "dt"] + and (tag != "img" or self.ignore_images) + ): + self.o("[") + self.maybe_automatic_link = None + self.empty_link = False + + if self.google_doc: + # the attrs parameter is empty for a closing tag. 
in addition, we + # need the attributes of the parent nodes in order to get a + # complete style description for the current element. we assume + # that google docs export well formed html. + parent_style: Dict[str, str] = {} + if start: + if self.tag_stack: + parent_style = self.tag_stack[-1][2] + tag_style = element_style(attrs, self.style_def, parent_style) + self.tag_stack.append((tag, attrs, tag_style)) + else: + dummy, attrs, tag_style = ( + self.tag_stack.pop() if self.tag_stack else (None, {}, {}) + ) + if self.tag_stack: + parent_style = self.tag_stack[-1][2] + + if hn(tag): + # check if nh is inside of an 'a' tag (incorrect but found in the wild) + if self.astack: + if start: + self.inheader = True + # are inside link name, so only add '#' if it can appear before '[' + if self.outtextlist and self.outtextlist[-1] == "[": + self.outtextlist.pop() + self.space = False + self.o(hn(tag) * "#" + " ") + self.o("[") + else: + self.p_p = 0 # don't break up link name + self.inheader = False + return # prevent redundant emphasis marks on headers + else: + self.p() + if start: + self.inheader = True + self.o(hn(tag) * "#" + " ") + else: + self.inheader = False + return # prevent redundant emphasis marks on headers + + if tag in ["p", "div"]: + if self.google_doc: + if start and google_has_height(tag_style): + self.p() + else: + self.soft_br() + elif self.astack: + pass + elif self.split_next_td: + pass + else: + self.p() + + if tag == "br" and start: + if self.blockquote > 0: + self.o(" \n> ") + else: + self.o(" \n") + + if tag == "hr" and start: + self.p() + self.o("* * *") + self.p() + + if tag in ["head", "style", "script"]: + if start: + self.quiet += 1 + else: + self.quiet -= 1 + + if tag == "style": + if start: + self.style += 1 + else: + self.style -= 1 + + if tag in ["body"]: + self.quiet = 0 # sites like 9rules.com never close + + if tag == "blockquote": + if start: + self.p() + self.o("> ", force=True) + self.start = True + self.blockquote += 1 + else: + self.blockquote -= 1 + self.p() + + if tag in ["em", "i", "u"] and not self.ignore_emphasis: + # Separate with a space if we immediately follow an alphanumeric + # character, since otherwise Markdown won't render the emphasis + # marks, and we'll be left with eg 'foo_bar_' visible. + # (Don't add a space otherwise, though, since there isn't one in the + # original HTML.) + if ( + start + and self.preceding_data + and self.preceding_data[-1] not in string.whitespace + and self.preceding_data[-1] not in string.punctuation + ): + emphasis = " " + self.emphasis_mark + self.preceding_data += " " + else: + emphasis = self.emphasis_mark + + self.o(emphasis) + if start: + self.stressed = True + + if tag in ["strong", "b"] and not self.ignore_emphasis: + # Separate with space if we immediately follow an * character, since + # without it, Markdown won't render the resulting *** correctly. + # (Don't add a space otherwise, though, since there isn't one in the + # original HTML.) + if ( + start + and self.preceding_data + # When `self.strong_mark` is set to empty, the next condition + # will cause IndexError since it's trying to match the data + # with the first character of the `self.strong_mark`. 
+ and len(self.strong_mark) > 0 + and self.preceding_data[-1] == self.strong_mark[0] + ): + strong = " " + self.strong_mark + self.preceding_data += " " + else: + strong = self.strong_mark + + self.o(strong) + if start: + self.stressed = True + + if tag in ["del", "strike", "s"]: + if start and self.preceding_data and self.preceding_data[-1] == "~": + strike = " ~~" + self.preceding_data += " " + else: + strike = "~~" + + self.o(strike) + if start: + self.stressed = True + + if self.google_doc: + if not self.inheader: + # handle some font attributes, but leave headers clean + self.handle_emphasis(start, tag_style, parent_style) + + if tag in ["kbd", "code", "tt"] and not self.pre: + self.o("`") # TODO: `` `this` `` + self.code = not self.code + + if tag == "abbr": + if start: + self.abbr_title = None + self.abbr_data = "" + if "title" in attrs: + self.abbr_title = attrs["title"] + else: + if self.abbr_title is not None: + assert self.abbr_data is not None + self.abbr_list[self.abbr_data] = self.abbr_title + self.abbr_title = None + self.abbr_data = None + + if tag == "q": + if not self.quote: + self.o(self.open_quote) + else: + self.o(self.close_quote) + self.quote = not self.quote + + def link_url(self: HTML2Text, link: str, title: str = "") -> None: + url = urlparse.urljoin(self.baseurl, link) + title = ' "{}"'.format(title) if title.strip() else "" + self.o("]({url}{title})".format(url=escape_md(url), title=title)) + + if tag == "a" and not self.ignore_links: + if start: + if ( + "href" in attrs + and attrs["href"] is not None + and not (self.skip_internal_links and attrs["href"].startswith("#")) + and not ( + self.ignore_mailto_links and attrs["href"].startswith("mailto:") + ) + ): + self.astack.append(attrs) + self.maybe_automatic_link = attrs["href"] + self.empty_link = True + if self.protect_links: + attrs["href"] = "<" + attrs["href"] + ">" + else: + self.astack.append(None) + else: + if self.astack: + a = self.astack.pop() + if self.maybe_automatic_link and not self.empty_link: + self.maybe_automatic_link = None + elif a: + assert a["href"] is not None + if self.empty_link: + self.o("[") + self.empty_link = False + self.maybe_automatic_link = None + if self.inline_links: + self.p_p = 0 + title = a.get("title") or "" + title = escape_md(title) + link_url(self, a["href"], title) + else: + i = self.previousIndex(a) + if i is not None: + a_props = self.a[i] + else: + self.acount += 1 + a_props = AnchorElement(a, self.acount, self.outcount) + self.a.append(a_props) + self.o("][" + str(a_props.count) + "]") + + if tag == "img" and start and not self.ignore_images: + if "src" in attrs and attrs["src"] is not None: + if not self.images_to_alt: + attrs["href"] = attrs["src"] + alt = attrs.get("alt") or self.default_image_alt + + # If we have images_with_size, write raw html including width, + # height, and alt attributes + if self.images_as_html or ( + self.images_with_size and ("width" in attrs or "height" in attrs) + ): + self.o("") + return + + # If we have a link to create, output the start + if self.maybe_automatic_link is not None: + href = self.maybe_automatic_link + if ( + self.images_to_alt + and escape_md(alt) == href + and self.absolute_url_matcher.match(href) + ): + self.o("<" + escape_md(alt) + ">") + self.empty_link = False + return + else: + self.o("[") + self.maybe_automatic_link = None + self.empty_link = False + + # If we have images_to_alt, we discard the image itself, + # considering only the alt text. 
+ if self.images_to_alt: + self.o(escape_md(alt)) + else: + self.o("![" + escape_md(alt) + "]") + if self.inline_links: + href = attrs.get("href") or "" + self.o( + "(" + escape_md(urlparse.urljoin(self.baseurl, href)) + ")" + ) + else: + i = self.previousIndex(attrs) + if i is not None: + a_props = self.a[i] + else: + self.acount += 1 + a_props = AnchorElement(attrs, self.acount, self.outcount) + self.a.append(a_props) + self.o("[" + str(a_props.count) + "]") + + if tag == "dl" and start: + self.p() + if tag == "dt" and not start: + self.pbr() + if tag == "dd" and start: + self.o(" ") + if tag == "dd" and not start: + self.pbr() + + if tag in ["ol", "ul"]: + # Google Docs create sub lists as top level lists + if not self.list and not self.lastWasList: + self.p() + if start: + if self.google_doc: + list_style = google_list_style(tag_style) + else: + list_style = tag + numbering_start = list_numbering_start(attrs) + self.list.append(ListElement(list_style, numbering_start)) + else: + if self.list: + self.list.pop() + if not self.google_doc and not self.list: + self.o("\n") + self.lastWasList = True + else: + self.lastWasList = False + + if tag == "li": + self.pbr() + if start: + if self.list: + li = self.list[-1] + else: + li = ListElement("ul", 0) + if self.google_doc: + self.o(" " * self.google_nest_count(tag_style)) + else: + # Indent two spaces per list, except use three spaces for an + # unordered list inside an ordered list. + # https://spec.commonmark.org/0.28/#motivation + # TODO: line up
  1. s > 9 correctly. + parent_list = None + for list in self.list: + self.o( + " " if parent_list == "ol" and list.name == "ul" else " " + ) + parent_list = list.name + + if li.name == "ul": + self.o(self.ul_item_mark + " ") + elif li.name == "ol": + li.num += 1 + self.o(str(li.num) + ". ") + self.start = True + + if tag in ["table", "tr", "td", "th"]: + if self.ignore_tables: + if tag == "tr": + if start: + pass + else: + self.soft_br() + else: + pass + + elif self.bypass_tables: + if start: + self.soft_br() + if tag in ["td", "th"]: + if start: + self.o("<{}>\n\n".format(tag)) + else: + self.o("\n".format(tag)) + else: + if start: + self.o("<{}>".format(tag)) + else: + self.o("".format(tag)) + + else: + if tag == "table": + if start: + self.table_start = True + if self.pad_tables: + self.o("<" + config.TABLE_MARKER_FOR_PAD + ">") + self.o(" \n") + else: + if self.pad_tables: + # add break in case the table is empty or its 1 row table + self.soft_br() + self.o("") + self.o(" \n") + if tag in ["td", "th"] and start: + if self.split_next_td: + self.o("| ") + self.split_next_td = True + + if tag == "tr" and start: + self.td_count = 0 + if tag == "tr" and not start: + self.split_next_td = False + self.soft_br() + if tag == "tr" and not start and self.table_start: + # Underline table header + self.o("|".join(["---"] * self.td_count)) + self.soft_br() + self.table_start = False + if tag in ["td", "th"] and start: + self.td_count += 1 + + if tag == "pre": + if start: + self.startpre = True + self.pre = True + else: + self.pre = False + if self.mark_code: + self.out("\n[/code]") + self.p() + + if tag in ["sup", "sub"] and self.include_sup_sub: + if start: + self.o("<{}>".format(tag)) + else: + self.o("".format(tag)) + + # TODO: Add docstring for these one letter functions + def pbr(self) -> None: + "Pretty print has a line break" + if self.p_p == 0: + self.p_p = 1 + + def p(self) -> None: + "Set pretty print to 1 or 2 lines" + self.p_p = 1 if self.single_line_break else 2 + + def soft_br(self) -> None: + "Soft breaks" + self.pbr() + self.br_toggle = " " + + def o( + self, data: str, puredata: bool = False, force: Union[bool, str] = False + ) -> None: + """ + Deal with indentation and whitespace + """ + if self.abbr_data is not None: + self.abbr_data += data + + if not self.quiet: + if self.google_doc: + # prevent white space immediately after 'begin emphasis' + # marks ('**' and '_') + lstripped_data = data.lstrip() + if self.drop_white_space and not (self.pre or self.code): + data = lstripped_data + if lstripped_data != "": + self.drop_white_space = 0 + + if puredata and not self.pre: + # This is a very dangerous call ... it could mess up + # all handling of   when not handled properly + # (see entityref) + data = re.sub(r"\s+", r" ", data) + if data and data[0] == " ": + self.space = True + data = data[1:] + if not data and not force: + return + + if self.startpre: + # self.out(" :") #TODO: not output when already one there + if not data.startswith("\n") and not data.startswith("\r\n"): + #
<pre>stuff...
    +                    data = "\n" + data
    +                if self.mark_code:
    +                    self.out("\n[code]")
    +                    self.p_p = 0
    +
    +            bq = ">" * self.blockquote
    +            if not (force and data and data[0] == ">") and self.blockquote:
    +                bq += " "
    +
    +            if self.pre:
    +                if not self.list:
    +                    bq += "    "
    +                # else: list content is already partially indented
    +                bq += "    " * len(self.list)
    +                data = data.replace("\n", "\n" + bq)
    +
    +            if self.startpre:
    +                self.startpre = False
    +                if self.list:
    +                    # use existing initial indentation
    +                    data = data.lstrip("\n")
    +
    +            if self.start:
    +                self.space = False
    +                self.p_p = 0
    +                self.start = False
    +
    +            if force == "end":
    +                # It's the end.
    +                self.p_p = 0
    +                self.out("\n")
    +                self.space = False
    +
    +            if self.p_p:
    +                self.out((self.br_toggle + "\n" + bq) * self.p_p)
    +                self.space = False
    +                self.br_toggle = ""
    +
    +            if self.space:
    +                if not self.lastWasNL:
    +                    self.out(" ")
    +                self.space = False
    +
    +            if self.a and (
    +                (self.p_p == 2 and self.links_each_paragraph) or force == "end"
    +            ):
    +                if force == "end":
    +                    self.out("\n")
    +
    +                newa = []
    +                for link in self.a:
    +                    if self.outcount > link.outcount:
    +                        self.out(
    +                            "   ["
    +                            + str(link.count)
    +                            + "]: "
    +                            + urlparse.urljoin(self.baseurl, link.attrs["href"])
    +                        )
    +                        if "title" in link.attrs and link.attrs["title"] is not None:
    +                            self.out(" (" + link.attrs["title"] + ")")
    +                        self.out("\n")
    +                    else:
    +                        newa.append(link)
    +
    +                # Don't need an extra line when nothing was done.
    +                if self.a != newa:
    +                    self.out("\n")
    +
    +                self.a = newa
    +
    +            if self.abbr_list and force == "end":
    +                for abbr, definition in self.abbr_list.items():
    +                    self.out("  *[" + abbr + "]: " + definition + "\n")
    +
    +            self.p_p = 0
    +            self.out(data)
    +            self.outcount += 1
    +
    +    def handle_data(self, data: str, entity_char: bool = False) -> None:
    +        if not data:
    +            # Data may be empty for some HTML entities. For example,
    +            # LEFT-TO-RIGHT MARK.
    +            return
    +
    +        if self.stressed:
    +            data = data.strip()
    +            self.stressed = False
    +            self.preceding_stressed = True
    +        elif self.preceding_stressed:
    +            if (
    +                re.match(r"[^][(){}\s.!?]", data[0])
    +                and not hn(self.current_tag)
    +                and self.current_tag not in ["a", "code", "pre"]
    +            ):
    +                # should match a letter or common punctuation
    +                data = " " + data
    +            self.preceding_stressed = False
    +
    +        if self.style:
    +            self.style_def.update(dumb_css_parser(data))
    +
    +        if self.maybe_automatic_link is not None:
    +            href = self.maybe_automatic_link
    +            if (
    +                href == data
    +                and self.absolute_url_matcher.match(href)
    +                and self.use_automatic_links
    +            ):
    +                self.o("<" + data + ">")
    +                self.empty_link = False
    +                return
    +            else:
    +                self.o("[")
    +                self.maybe_automatic_link = None
    +                self.empty_link = False
    +
    +        if not self.code and not self.pre and not entity_char:
    +            data = escape_md_section(data, snob=self.escape_snob, escape_dot=self.escape_dot, escape_plus=self.escape_plus, escape_dash=self.escape_dash)
    +        self.preceding_data = data
    +        self.o(data, puredata=True)
    +
    +    def charref(self, name: str) -> str:
    +        if name[0] in ["x", "X"]:
    +            c = int(name[1:], 16)
    +        else:
    +            c = int(name)
    +
    +        if not self.unicode_snob and c in unifiable_n:
    +            return unifiable_n[c]
    +        else:
    +            try:
    +                return chr(c)
    +            except ValueError:  # invalid unicode
    +                return ""
    +
    +    def entityref(self, c: str) -> str:
    +        if not self.unicode_snob and c in config.UNIFIABLE:
    +            return config.UNIFIABLE[c]
    +        try:
    +            ch = html.entities.html5[c + ";"]
    +        except KeyError:
    +            return "&" + c + ";"
    +        return config.UNIFIABLE[c] if c == "nbsp" else ch
    +
    +    def google_nest_count(self, style: Dict[str, str]) -> int:
    +        """
    +        Calculate the nesting count of google doc lists
    +
    +        :type style: dict
    +
    +        :rtype: int
    +        """
    +        nest_count = 0
    +        if "margin-left" in style:
    +            nest_count = int(style["margin-left"][:-2]) // self.google_list_indent
    +
    +        return nest_count
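For reference (not part of the diff): a minimal sketch of how `google_nest_count` maps an inline `margin-left` style to a list nesting depth. The 36-pixel indent and the style dict are illustrative values; the attribute normally defaults to `config.GOOGLE_LIST_INDENT`.

```python
from crawl4ai.html2text import HTML2Text

h = HTML2Text()
h.google_list_indent = 36  # pixels per nesting level (illustrative value)

# "72px" -> int("72") // 36 -> nesting depth 2
print(h.google_nest_count({"margin-left": "72px"}))  # 2
```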
    +
    +    def optwrap(self, text: str) -> str:
    +        """
    +        Wrap all paragraphs in the provided text.
    +
    +        :type text: str
    +
    +        :rtype: str
    +        """
    +        if not self.body_width:
    +            return text
    +
    +        result = ""
    +        newlines = 0
    +        # I cannot think of a better solution for now.
    +        # To avoid the non-wrap behaviour for entire paras
    +        # because of the presence of a link in it
    +        if not self.wrap_links:
    +            self.inline_links = False
    +        for para in text.split("\n"):
    +            if len(para) > 0:
    +                if not skipwrap(
    +                    para, self.wrap_links, self.wrap_list_items, self.wrap_tables
    +                ):
    +                    indent = ""
    +                    if para.startswith("  " + self.ul_item_mark):
    +                        # list item continuation: add a double indent to the
    +                        # new lines
    +                        indent = "    "
    +                    elif para.startswith("> "):
    +                        # blockquote continuation: add the greater than symbol
    +                        # to the new lines
    +                        indent = "> "
    +                    wrapped = wrap(
    +                        para,
    +                        self.body_width,
    +                        break_long_words=False,
    +                        subsequent_indent=indent,
    +                    )
    +                    result += "\n".join(wrapped)
    +                    if para.endswith("  "):
    +                        result += "  \n"
    +                        newlines = 1
    +                    elif indent:
    +                        result += "\n"
    +                        newlines = 1
    +                    else:
    +                        result += "\n\n"
    +                        newlines = 2
    +                else:
    +                    # Warning for the tempted!!!
    +                    # Be aware that obvious replacement of this with
    +                    # line.isspace()
    +                    # DOES NOT work! Explanations are welcome.
    +                    if not config.RE_SPACE.match(para):
    +                        result += para + "\n"
    +                        newlines = 1
    +            else:
    +                if newlines < 2:
    +                    result += "\n"
    +                    newlines += 1
    +        return result
    +
    +def html2text(html: str, baseurl: str = "", bodywidth: Optional[int] = None) -> str:
    +    if bodywidth is None:
    +        bodywidth = config.BODY_WIDTH
    +    h = HTML2Text(baseurl=baseurl, bodywidth=bodywidth)
    +
    +    return h.handle(html)
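As a usage note (not part of the diff): a minimal sketch of the module-level `html2text()` helper defined above. The sample markup and base URL are illustrative; `bodywidth=0` disables line wrapping, while omitting it falls back to `config.BODY_WIDTH`.

```python
from crawl4ai.html2text import html2text

sample = "<h1>Title</h1><p>See the <a href='/docs'>docs</a> for details.</p>"

# Relative links are resolved against baseurl; bodywidth=0 turns off wrapping.
markdown = html2text(sample, baseurl="https://example.com", bodywidth=0)
print(markdown)
```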
    +
    +class CustomHTML2Text(HTML2Text):
    +    def __init__(self, *args, handle_code_in_pre=False, **kwargs):
    +        super().__init__(*args, **kwargs)
    +        self.inside_pre = False
    +        self.inside_code = False
    +        self.preserve_tags = set()  # Set of tags to preserve
    +        self.current_preserved_tag = None
    +        self.preserved_content = []
    +        self.preserve_depth = 0
    +        self.handle_code_in_pre = handle_code_in_pre 
    +        
    +        # Configuration options
    +        self.skip_internal_links = False
    +        self.single_line_break = False
    +        self.mark_code = False
    +        self.include_sup_sub = False
    +        self.body_width = 0
    +        self.ignore_mailto_links = True
    +        self.ignore_links = False
    +        self.escape_backslash = False
    +        self.escape_dot = False
    +        self.escape_plus = False
    +        self.escape_dash = False
    +        self.escape_snob = False
    +
    +    def update_params(self, **kwargs):
    +        """Update parameters and set preserved tags."""
    +        for key, value in kwargs.items():
    +            if key == 'preserve_tags':
    +                self.preserve_tags = set(value)
    +            elif key == 'handle_code_in_pre':
    +                self.handle_code_in_pre = value
    +            else:
    +                setattr(self, key, value)
    +
    +    def handle_tag(self, tag, attrs, start):
    +        # Handle preserved tags
    +        if tag in self.preserve_tags:
    +            if start:
    +                if self.preserve_depth == 0:
    +                    self.current_preserved_tag = tag
    +                    self.preserved_content = []
    +                    # Format opening tag with attributes
    +                    attr_str = ''.join(f' {k}="{v}"' for k, v in attrs.items() if v is not None)
    +                    self.preserved_content.append(f'<{tag}{attr_str}>')
    +                self.preserve_depth += 1
    +                return
    +            else:
    +                self.preserve_depth -= 1
    +                if self.preserve_depth == 0:
+                    self.preserved_content.append(f'</{tag}>')
    +                    # Output the preserved HTML block with proper spacing
    +                    preserved_html = ''.join(self.preserved_content)
    +                    self.o('\n' + preserved_html + '\n')
    +                    self.current_preserved_tag = None
    +                return
    +
    +        # If we're inside a preserved tag, collect all content
    +        if self.preserve_depth > 0:
    +            if start:
    +                # Format nested tags with attributes
    +                attr_str = ''.join(f' {k}="{v}"' for k, v in attrs.items() if v is not None)
    +                self.preserved_content.append(f'<{tag}{attr_str}>')
    +            else:
+                self.preserved_content.append(f'</{tag}>')
    +            return
    +
    +        # Handle pre tags
    +        if tag == 'pre':
    +            if start:
    +                self.o('```\n')  # Markdown code block start
    +                self.inside_pre = True
    +            else:
    +                self.o('\n```\n')  # Markdown code block end
    +                self.inside_pre = False
    +        elif tag == 'code':
    +            if self.inside_pre and not self.handle_code_in_pre:
    +                # Ignore code tags inside pre blocks if handle_code_in_pre is False
    +                return
    +            if start:
    +                self.o('`')  # Markdown inline code start
    +                self.inside_code = True
    +            else:
    +                self.o('`')  # Markdown inline code end
    +                self.inside_code = False
    +        else:
    +            super().handle_tag(tag, attrs, start)
    +
    +    def handle_data(self, data, entity_char=False):
    +        """Override handle_data to capture content within preserved tags."""
    +        if self.preserve_depth > 0:
    +            self.preserved_content.append(data)
    +            return
    +
    +        if self.inside_pre:
    +            # Output the raw content for pre blocks, including content inside code tags
    +            self.o(data)  # Directly output the data as-is (preserve newlines)
    +            return
    +        if self.inside_code:
    +            # Inline code: no newlines allowed
    +            self.o(data.replace('\n', ' '))
    +            return
    +
    +        # Default behavior for other tags
    +        super().handle_data(data, entity_char)
    +
    +
    +    #     # Handle pre tags
    +    #     if tag == 'pre':
    +    #         if start:
    +    #             self.o('```\n')
    +    #             self.inside_pre = True
    +    #         else:
    +    #             self.o('\n```')
    +    #             self.inside_pre = False
    +    #     # elif tag in ["h1", "h2", "h3", "h4", "h5", "h6"]:
    +    #     #     pass
    +    #     else:
    +    #         super().handle_tag(tag, attrs, start)
    +
    +    # def handle_data(self, data, entity_char=False):
    +    #     """Override handle_data to capture content within preserved tags."""
    +    #     if self.preserve_depth > 0:
    +    #         self.preserved_content.append(data)
    +    #         return
    +    #     super().handle_data(data, entity_char)
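To close out `__init__.py`, a hedged usage sketch of the `CustomHTML2Text` subclass defined above: `preserve_tags` keeps selected elements as raw HTML, and `<pre>` blocks become fenced code. The sample document is illustrative only.

```python
from crawl4ai.html2text import CustomHTML2Text

doc = """
<h2>Report</h2>
<table><tr><td>kept as raw HTML</td></tr></table>
<pre><code>print("hello")</code></pre>
"""

converter = CustomHTML2Text(handle_code_in_pre=False)
converter.update_params(preserve_tags={"table"}, ignore_links=True)
print(converter.handle(doc))
```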
    diff --git a/crawl4ai/html2text/__main__.py b/crawl4ai/html2text/__main__.py
    new file mode 100644
    index 0000000000000000000000000000000000000000..4e28416e104515e90fca4b69cc60d0c61fd15d61
    --- /dev/null
    +++ b/crawl4ai/html2text/__main__.py
    @@ -0,0 +1,3 @@
    +from .cli import main
    +
    +main()
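For orientation (not part of the diff): because `__main__.py` simply delegates to `cli.main()`, the converter can be driven as a module. A sketch using `subprocess`, assuming the package is installed so the module path resolves; the flags map to the argparse options defined in `cli.py` below, and `page.html` is a placeholder file name.

```python
import subprocess

# Equivalent to: python -m crawl4ai.html2text --body-width 0 --ignore-images page.html
result = subprocess.run(
    ["python", "-m", "crawl4ai.html2text", "--body-width", "0", "--ignore-images", "page.html"],
    capture_output=True,
    text=True,
    check=True,
)
print(result.stdout)  # Markdown written to stdout by cli.main()
```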
    diff --git a/crawl4ai/html2text/_typing.py b/crawl4ai/html2text/_typing.py
    new file mode 100644
    index 0000000000000000000000000000000000000000..eed83251cd381e68c0c5062ac3a50b97fbc3a483
    --- /dev/null
    +++ b/crawl4ai/html2text/_typing.py
    @@ -0,0 +1,2 @@
    +class OutCallback:
    +    def __call__(self, s: str) -> None: ...
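A small sketch of the `OutCallback` protocol above: `HTML2Text` accepts any callable taking a single `str` as its `out` parameter, in which case output chunks go to the callback instead of the internal buffer, so `handle()` returns an empty string. The sample HTML is illustrative.

```python
from crawl4ai.html2text import HTML2Text

chunks = []

def collect(s: str) -> None:
    # Satisfies OutCallback: a callable accepting one string.
    chunks.append(s)

h = HTML2Text(out=collect, bodywidth=0)
h.handle("<p>Hello <em>world</em></p>")  # returns "" because the buffer is bypassed
print("".join(chunks))
```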
    diff --git a/crawl4ai/html2text/cli.py b/crawl4ai/html2text/cli.py
    new file mode 100644
    index 0000000000000000000000000000000000000000..015322743d7bebb535b105d493cd6d23da64f303
    --- /dev/null
    +++ b/crawl4ai/html2text/cli.py
    @@ -0,0 +1,330 @@
    +import argparse
    +import sys
    +
    +from . import HTML2Text, __version__, config
    +
    +
    +def main() -> None:
    +    baseurl = ""
    +
    +    class bcolors:
    +        HEADER = "\033[95m"
    +        OKBLUE = "\033[94m"
    +        OKGREEN = "\033[92m"
    +        WARNING = "\033[93m"
    +        FAIL = "\033[91m"
    +        ENDC = "\033[0m"
    +        BOLD = "\033[1m"
    +        UNDERLINE = "\033[4m"
    +
    +    p = argparse.ArgumentParser()
    +    p.add_argument(
    +        "--default-image-alt",
    +        dest="default_image_alt",
    +        default=config.DEFAULT_IMAGE_ALT,
    +        help="The default alt string for images with missing ones",
    +    )
    +    p.add_argument(
    +        "--pad-tables",
    +        dest="pad_tables",
    +        action="store_true",
    +        default=config.PAD_TABLES,
    +        help="pad the cells to equal column width in tables",
    +    )
    +    p.add_argument(
    +        "--no-wrap-links",
    +        dest="wrap_links",
    +        action="store_false",
    +        default=config.WRAP_LINKS,
    +        help="don't wrap links during conversion",
    +    )
    +    p.add_argument(
    +        "--wrap-list-items",
    +        dest="wrap_list_items",
    +        action="store_true",
    +        default=config.WRAP_LIST_ITEMS,
    +        help="wrap list items during conversion",
    +    )
    +    p.add_argument(
    +        "--wrap-tables",
    +        dest="wrap_tables",
    +        action="store_true",
    +        default=config.WRAP_TABLES,
    +        help="wrap tables",
    +    )
    +    p.add_argument(
    +        "--ignore-emphasis",
    +        dest="ignore_emphasis",
    +        action="store_true",
    +        default=config.IGNORE_EMPHASIS,
    +        help="don't include any formatting for emphasis",
    +    )
    +    p.add_argument(
    +        "--reference-links",
    +        dest="inline_links",
    +        action="store_false",
    +        default=config.INLINE_LINKS,
    +        help="use reference style links instead of inline links",
    +    )
    +    p.add_argument(
    +        "--ignore-links",
    +        dest="ignore_links",
    +        action="store_true",
    +        default=config.IGNORE_ANCHORS,
    +        help="don't include any formatting for links",
    +    )
    +    p.add_argument(
    +        "--ignore-mailto-links",
    +        action="store_true",
    +        dest="ignore_mailto_links",
    +        default=config.IGNORE_MAILTO_LINKS,
    +        help="don't include mailto: links",
    +    )
    +    p.add_argument(
    +        "--protect-links",
    +        dest="protect_links",
    +        action="store_true",
    +        default=config.PROTECT_LINKS,
    +        help="protect links from line breaks surrounding them with angle brackets",
    +    )
    +    p.add_argument(
    +        "--ignore-images",
    +        dest="ignore_images",
    +        action="store_true",
    +        default=config.IGNORE_IMAGES,
    +        help="don't include any formatting for images",
    +    )
    +    p.add_argument(
    +        "--images-as-html",
    +        dest="images_as_html",
    +        action="store_true",
    +        default=config.IMAGES_AS_HTML,
    +        help=(
    +            "Always write image tags as raw html; preserves `height`, `width` and "
    +            "`alt` if possible."
    +        ),
    +    )
    +    p.add_argument(
    +        "--images-to-alt",
    +        dest="images_to_alt",
    +        action="store_true",
    +        default=config.IMAGES_TO_ALT,
    +        help="Discard image data, only keep alt text",
    +    )
    +    p.add_argument(
    +        "--images-with-size",
    +        dest="images_with_size",
    +        action="store_true",
    +        default=config.IMAGES_WITH_SIZE,
    +        help=(
    +            "Write image tags with height and width attrs as raw html to retain "
    +            "dimensions"
    +        ),
    +    )
    +    p.add_argument(
    +        "-g",
    +        "--google-doc",
    +        action="store_true",
    +        dest="google_doc",
    +        default=False,
    +        help="convert an html-exported Google Document",
    +    )
    +    p.add_argument(
    +        "-d",
    +        "--dash-unordered-list",
    +        action="store_true",
    +        dest="ul_style_dash",
    +        default=False,
    +        help="use a dash rather than a star for unordered list items",
    +    )
    +    p.add_argument(
    +        "-e",
    +        "--asterisk-emphasis",
    +        action="store_true",
    +        dest="em_style_asterisk",
    +        default=False,
    +        help="use an asterisk rather than an underscore for emphasized text",
    +    )
    +    p.add_argument(
    +        "-b",
    +        "--body-width",
    +        dest="body_width",
    +        type=int,
    +        default=config.BODY_WIDTH,
    +        help="number of characters per output line, 0 for no wrap",
    +    )
    +    p.add_argument(
    +        "-i",
    +        "--google-list-indent",
    +        dest="list_indent",
    +        type=int,
    +        default=config.GOOGLE_LIST_INDENT,
    +        help="number of pixels Google indents nested lists",
    +    )
    +    p.add_argument(
    +        "-s",
    +        "--hide-strikethrough",
    +        action="store_true",
    +        dest="hide_strikethrough",
    +        default=False,
    +        help="hide strike-through text. only relevant when -g is " "specified as well",
    +    )
    +    p.add_argument(
    +        "--escape-all",
    +        action="store_true",
    +        dest="escape_snob",
    +        default=False,
    +        help=(
    +            "Escape all special characters.  Output is less readable, but avoids "
    +            "corner case formatting issues."
    +        ),
    +    )
    +    p.add_argument(
    +        "--bypass-tables",
    +        action="store_true",
    +        dest="bypass_tables",
    +        default=config.BYPASS_TABLES,
    +        help="Format tables in HTML rather than Markdown syntax.",
    +    )
    +    p.add_argument(
    +        "--ignore-tables",
    +        action="store_true",
    +        dest="ignore_tables",
    +        default=config.IGNORE_TABLES,
    +        help="Ignore table-related tags (table, th, td, tr) " "while keeping rows.",
    +    )
    +    p.add_argument(
    +        "--single-line-break",
    +        action="store_true",
    +        dest="single_line_break",
    +        default=config.SINGLE_LINE_BREAK,
    +        help=(
    +            "Use a single line break after a block element rather than two line "
    +            "breaks. NOTE: Requires --body-width=0"
    +        ),
    +    )
    +    p.add_argument(
    +        "--unicode-snob",
    +        action="store_true",
    +        dest="unicode_snob",
    +        default=config.UNICODE_SNOB,
    +        help="Use unicode throughout document",
    +    )
    +    p.add_argument(
    +        "--no-automatic-links",
    +        action="store_false",
    +        dest="use_automatic_links",
    +        default=config.USE_AUTOMATIC_LINKS,
    +        help="Do not use automatic links wherever applicable",
    +    )
    +    p.add_argument(
    +        "--no-skip-internal-links",
    +        action="store_false",
    +        dest="skip_internal_links",
    +        default=config.SKIP_INTERNAL_LINKS,
    +        help="Do not skip internal links",
    +    )
    +    p.add_argument(
    +        "--links-after-para",
    +        action="store_true",
    +        dest="links_each_paragraph",
    +        default=config.LINKS_EACH_PARAGRAPH,
    +        help="Put links after each paragraph instead of document",
    +    )
    +    p.add_argument(
    +        "--mark-code",
    +        action="store_true",
    +        dest="mark_code",
    +        default=config.MARK_CODE,
    +        help="Mark program code blocks with [code]...[/code]",
    +    )
    +    p.add_argument(
    +        "--decode-errors",
    +        dest="decode_errors",
    +        default=config.DECODE_ERRORS,
    +        help=(
    +            "What to do in case of decode errors.'ignore', 'strict' and 'replace' are "
    +            "acceptable values"
    +        ),
    +    )
    +    p.add_argument(
    +        "--open-quote",
    +        dest="open_quote",
    +        default=config.OPEN_QUOTE,
    +        help="The character used to open quotes",
    +    )
    +    p.add_argument(
    +        "--close-quote",
    +        dest="close_quote",
    +        default=config.CLOSE_QUOTE,
    +        help="The character used to close quotes",
    +    )
    +    p.add_argument(
    +        "--version", action="version", version=".".join(map(str, __version__))
    +    )
    +    p.add_argument("filename", nargs="?")
    +    p.add_argument("encoding", nargs="?", default="utf-8")
    +    p.add_argument(
    +        "--include-sup-sub",
    +        dest="include_sup_sub",
    +        action="store_true",
    +        default=config.INCLUDE_SUP_SUB,
    +        help="Include the sup and sub tags",
    +    )
    +    args = p.parse_args()
    +
    +    if args.filename and args.filename != "-":
    +        with open(args.filename, "rb") as fp:
    +            data = fp.read()
    +    else:
    +        data = sys.stdin.buffer.read()
    +
    +    try:
    +        html = data.decode(args.encoding, args.decode_errors)
    +    except UnicodeDecodeError as err:
    +        warning = bcolors.WARNING + "Warning:" + bcolors.ENDC
    +        warning += " Use the " + bcolors.OKGREEN
    +        warning += "--decode-errors=ignore" + bcolors.ENDC + " flag."
    +        print(warning)
    +        raise err
    +
    +    h = HTML2Text(baseurl=baseurl)
    +    # handle options
    +    if args.ul_style_dash:
    +        h.ul_item_mark = "-"
    +    if args.em_style_asterisk:
    +        h.emphasis_mark = "*"
    +        h.strong_mark = "__"
    +
    +    h.body_width = args.body_width
    +    h.google_list_indent = args.list_indent
    +    h.ignore_emphasis = args.ignore_emphasis
    +    h.ignore_links = args.ignore_links
    +    h.ignore_mailto_links = args.ignore_mailto_links
    +    h.protect_links = args.protect_links
    +    h.ignore_images = args.ignore_images
    +    h.images_as_html = args.images_as_html
    +    h.images_to_alt = args.images_to_alt
    +    h.images_with_size = args.images_with_size
    +    h.google_doc = args.google_doc
    +    h.hide_strikethrough = args.hide_strikethrough
    +    h.escape_snob = args.escape_snob
    +    h.bypass_tables = args.bypass_tables
    +    h.ignore_tables = args.ignore_tables
    +    h.single_line_break = args.single_line_break
    +    h.inline_links = args.inline_links
    +    h.unicode_snob = args.unicode_snob
    +    h.use_automatic_links = args.use_automatic_links
    +    h.skip_internal_links = args.skip_internal_links
    +    h.links_each_paragraph = args.links_each_paragraph
    +    h.mark_code = args.mark_code
    +    h.wrap_links = args.wrap_links
    +    h.wrap_list_items = args.wrap_list_items
    +    h.wrap_tables = args.wrap_tables
    +    h.pad_tables = args.pad_tables
    +    h.default_image_alt = args.default_image_alt
    +    h.open_quote = args.open_quote
    +    h.close_quote = args.close_quote
    +    h.include_sup_sub = args.include_sup_sub
    +
    +    sys.stdout.write(h.handle(html))
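The CLI above only copies the parsed flags onto an HTML2Text instance before calling handle(); the same conversion can be driven programmatically. A minimal sketch, assuming this vendored copy keeps the upstream class name HTML2Text importable from crawl4ai.html2text:

# Illustrative only; import path assumed from the file layout in this diff.
from crawl4ai.html2text import HTML2Text

h = HTML2Text(baseurl="https://example.com")  # baseurl resolves relative links
h.body_width = 0        # same effect as --body-width=0 (disable wrapping)
h.mark_code = True      # same effect as --mark-code ([code]...[/code])
h.unicode_snob = True   # same effect as --unicode-snob

markdown = h.handle("<h1>Title</h1><p>See <a href='/docs'>the docs</a>.</p>")
print(markdown)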
    diff --git a/crawl4ai/html2text/config.py b/crawl4ai/html2text/config.py
    new file mode 100644
    index 0000000000000000000000000000000000000000..d14ed64f90772ea9a3e92cc850b659f6f31756f0
    --- /dev/null
    +++ b/crawl4ai/html2text/config.py
    @@ -0,0 +1,172 @@
    +import re
    +
    +# Use Unicode characters instead of their ascii pseudo-replacements
    +UNICODE_SNOB = False
    +
    +# Marker to use for marking tables for padding post processing
    +TABLE_MARKER_FOR_PAD = "special_marker_for_table_padding"
    +# Escape all special characters.  Output is less readable, but avoids
    +# corner case formatting issues.
    +ESCAPE_SNOB = False
    +ESCAPE_BACKSLASH = False
    +ESCAPE_DOT = False
    +ESCAPE_PLUS = False
    +ESCAPE_DASH = False
    +
    +# Put the links after each paragraph instead of at the end.
    +LINKS_EACH_PARAGRAPH = False
    +
    +# Wrap long lines at position. 0 for no wrapping.
    +BODY_WIDTH = 78
    +
    +# Don't show internal links (href="#local-anchor") -- corresponding link
    +# targets won't be visible in the plain text file anyway.
    +SKIP_INTERNAL_LINKS = True
    +
    +# Use inline, rather than reference, formatting for images and links
    +INLINE_LINKS = True
    +
    +# Protect links from line breaks surrounding them with angle brackets (in
    +# addition to their square brackets)
    +PROTECT_LINKS = False
    +# WRAP_LINKS = True
    +WRAP_LINKS = True
    +
    +# Wrap list items.
    +WRAP_LIST_ITEMS = False
    +
    +# Wrap tables
    +WRAP_TABLES = False
    +
    +# Number of pixels Google indents nested lists
    +GOOGLE_LIST_INDENT = 36
    +
    +# Values Google and others may use to indicate bold text
    +BOLD_TEXT_STYLE_VALUES = ("bold", "700", "800", "900")
    +
    +IGNORE_ANCHORS = False
    +IGNORE_MAILTO_LINKS = False
    +IGNORE_IMAGES = False
    +IMAGES_AS_HTML = False
    +IMAGES_TO_ALT = False
    +IMAGES_WITH_SIZE = False
    +IGNORE_EMPHASIS = False
    +MARK_CODE = False
    +DECODE_ERRORS = "strict"
    +DEFAULT_IMAGE_ALT = ""
    +PAD_TABLES = False
    +
    +# Convert links with same href and text to <href> format
    +# if they are absolute links
    +USE_AUTOMATIC_LINKS = True
    +
    +# For checking space-only lines on line 771
    +RE_SPACE = re.compile(r"\s\+")
    +
    +RE_ORDERED_LIST_MATCHER = re.compile(r"\d+\.\s")
    +RE_UNORDERED_LIST_MATCHER = re.compile(r"[-\*\+]\s")
    +RE_MD_CHARS_MATCHER = re.compile(r"([\\\[\]\(\)])")
    +RE_MD_CHARS_MATCHER_ALL = re.compile(r"([`\*_{}\[\]\(\)#!])")
    +
    +# to find links in the text
    +RE_LINK = re.compile(r"(\[.*?\] ?\(.*?\))|(\[.*?\]:.*?)")
    +
    +# to find table separators
    +RE_TABLE = re.compile(r" \| ")
    +
    +RE_MD_DOT_MATCHER = re.compile(
    +    r"""
    +    ^             # start of line
    +    (\s*\d+)      # optional whitespace and a number
    +    (\.)          # dot
    +    (?=\s)        # lookahead assert whitespace
    +    """,
    +    re.MULTILINE | re.VERBOSE,
    +)
    +RE_MD_PLUS_MATCHER = re.compile(
    +    r"""
    +    ^
    +    (\s*)
    +    (\+)
    +    (?=\s)
    +    """,
    +    flags=re.MULTILINE | re.VERBOSE,
    +)
    +RE_MD_DASH_MATCHER = re.compile(
    +    r"""
    +    ^
    +    (\s*)
    +    (-)
    +    (?=\s|\-)     # followed by whitespace (bullet list, or spaced out hr)
    +                  # or another dash (header or hr)
    +    """,
    +    flags=re.MULTILINE | re.VERBOSE,
    +)
    +RE_SLASH_CHARS = r"\`*_{}[]()#+-.!"
    +RE_MD_BACKSLASH_MATCHER = re.compile(
    +    r"""
    +    (\\)          # match one slash
    +    (?=[%s])      # followed by a char that requires escaping
    +    """
    +    % re.escape(RE_SLASH_CHARS),
    +    flags=re.VERBOSE,
    +)
    +
    +UNIFIABLE = {
    +    "rsquo": "'",
    +    "lsquo": "'",
    +    "rdquo": '"',
    +    "ldquo": '"',
    +    "copy": "(C)",
    +    "mdash": "--",
    +    "nbsp": " ",
    +    "rarr": "->",
    +    "larr": "<-",
    +    "middot": "*",
    +    "ndash": "-",
    +    "oelig": "oe",
    +    "aelig": "ae",
    +    "agrave": "a",
    +    "aacute": "a",
    +    "acirc": "a",
    +    "atilde": "a",
    +    "auml": "a",
    +    "aring": "a",
    +    "egrave": "e",
    +    "eacute": "e",
    +    "ecirc": "e",
    +    "euml": "e",
    +    "igrave": "i",
    +    "iacute": "i",
    +    "icirc": "i",
    +    "iuml": "i",
    +    "ograve": "o",
    +    "oacute": "o",
    +    "ocirc": "o",
    +    "otilde": "o",
    +    "ouml": "o",
    +    "ugrave": "u",
    +    "uacute": "u",
    +    "ucirc": "u",
    +    "uuml": "u",
    +    "lrm": "",
    +    "rlm": "",
    +}
    +
    +# Format tables in HTML rather than Markdown syntax
    +BYPASS_TABLES = False
    +# Ignore table-related tags (table, th, td, tr) while keeping rows
    +IGNORE_TABLES = False
    +
    +
    +# Use a single line break after a block element rather than two line breaks.
    +# NOTE: Requires body width setting to be 0.
    +SINGLE_LINE_BREAK = False
    +
    +
    +# Use double quotation marks when converting the <q> tag.
    +OPEN_QUOTE = '"'
    +CLOSE_QUOTE = '"'
    +
    +# Include the <sup> and <sub> tags
    +INCLUDE_SUP_SUB = False
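These module-level constants are only defaults: each HTML2Text instance copies them at construction time and can override them per run. A small sketch, assuming the upstream-style attribute names carry over in this vendored copy:

# Illustrative only; attribute names assumed to match upstream html2text.
from crawl4ai.html2text import HTML2Text, config

print(config.BODY_WIDTH)          # 78: default wrap column
print(config.UNIFIABLE["mdash"])  # '--': ASCII fallback used when UNICODE_SNOB is False

h = HTML2Text()
h.body_width = 0        # per-instance override of BODY_WIDTH
h.unicode_snob = True   # keep Unicode characters instead of the UNIFIABLE fallbacks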
    diff --git a/crawl4ai/html2text/elements.py b/crawl4ai/html2text/elements.py
    new file mode 100644
    index 0000000000000000000000000000000000000000..2533ec084e664f6c4cd19adb175325de0c844d55
    --- /dev/null
    +++ b/crawl4ai/html2text/elements.py
    @@ -0,0 +1,18 @@
    +from typing import Dict, Optional
    +
    +
    +class AnchorElement:
    +    __slots__ = ["attrs", "count", "outcount"]
    +
    +    def __init__(self, attrs: Dict[str, Optional[str]], count: int, outcount: int):
    +        self.attrs = attrs
    +        self.count = count
    +        self.outcount = outcount
    +
    +
    +class ListElement:
    +    __slots__ = ["name", "num"]
    +
    +    def __init__(self, name: str, num: int):
    +        self.name = name
    +        self.num = num
    diff --git a/crawl4ai/html2text/utils.py b/crawl4ai/html2text/utils.py
    new file mode 100644
    index 0000000000000000000000000000000000000000..1909d2cf754b57c8d1fba112a1a1eb3af81a8d3b
    --- /dev/null
    +++ b/crawl4ai/html2text/utils.py
    @@ -0,0 +1,303 @@
    +import html.entities
    +from typing import Dict, List, Optional
    +
    +from . import config
    +
    +unifiable_n = {
    +    html.entities.name2codepoint[k]: v
    +    for k, v in config.UNIFIABLE.items()
    +    if k != "nbsp"
    +}
    +
    +
    +def hn(tag: str) -> int:
    +    if tag[0] == "h" and len(tag) == 2:
    +        n = tag[1]
    +        if "0" < n <= "9":
    +            return int(n)
    +    return 0
    +
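For illustration, hn() maps two-character heading tags to their numeric level and everything else to 0; note that "h0" also falls through because the comparison is strictly greater than "0":

# Illustrative checks for hn(); import path taken from this diff.
from crawl4ai.html2text.utils import hn

assert hn("h1") == 1
assert hn("h6") == 6
assert hn("h0") == 0    # "0" < "0" is False
assert hn("div") == 0   # not an hN tag
assert hn("h10") == 0   # only two-character tags qualify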
    +
    +def dumb_property_dict(style: str) -> Dict[str, str]:
    +    """
    +    :returns: A hash of css attributes
    +    """
    +    return {
    +        x.strip().lower(): y.strip().lower()
    +        for x, y in [z.split(":", 1) for z in style.split(";") if ":" in z]
    +    }
    +
    +
    +def dumb_css_parser(data: str) -> Dict[str, Dict[str, str]]:
    +    """
    +    :type data: str
    +
    +    :returns: A hash of css selectors, each of which contains a hash of
    +    css attributes.
    +    :rtype: dict
    +    """
    +    # remove @import sentences
    +    data += ";"
    +    importIndex = data.find("@import")
    +    while importIndex != -1:
    +        data = data[0:importIndex] + data[data.find(";", importIndex) + 1 :]
    +        importIndex = data.find("@import")
    +
    +    # parse the css. reverted from dictionary comprehension in order to
    +    # support older pythons
    +    pairs = [x.split("{") for x in data.split("}") if "{" in x.strip()]
    +    try:
    +        elements = {a.strip(): dumb_property_dict(b) for a, b in pairs}
    +    except ValueError:
    +        elements = {}  # not that important
    +
    +    return elements
    +
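As a quick illustration of the two CSS helpers above (input values are made up): dumb_css_parser() strips @import statements, splits on braces, and hands each declaration block to dumb_property_dict(), which lowercases both keys and values:

# Illustrative only.
from crawl4ai.html2text.utils import dumb_css_parser, dumb_property_dict

dumb_property_dict("Font-Weight: BOLD; text-decoration: underline")
# -> {'font-weight': 'bold', 'text-decoration': 'underline'}

dumb_css_parser("@import url(skip.css); .note { Color: RED; font-style: italic }")
# -> {'.note': {'color': 'red', 'font-style': 'italic'}}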
    +
    +def element_style(
    +    attrs: Dict[str, Optional[str]],
    +    style_def: Dict[str, Dict[str, str]],
    +    parent_style: Dict[str, str],
    +) -> Dict[str, str]:
    +    """
    +    :type attrs: dict
    +    :type style_def: dict
    +    :type parent_style: dict
    +
    +    :returns: A hash of the 'final' style attributes of the element
    +    :rtype: dict
    +    """
    +    style = parent_style.copy()
    +    if "class" in attrs:
    +        assert attrs["class"] is not None
    +        for css_class in attrs["class"].split():
    +            css_style = style_def.get("." + css_class, {})
    +            style.update(css_style)
    +    if "style" in attrs:
    +        assert attrs["style"] is not None
    +        immediate_style = dumb_property_dict(attrs["style"])
    +        style.update(immediate_style)
    +
    +    return style
    +
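An illustrative call showing the precedence element_style() implements: the parent style is the base, matching class selectors override it, and the inline style attribute overrides both:

# Illustrative only.
from crawl4ai.html2text.utils import element_style

parent = {"font-weight": "bold"}
style_def = {".quiet": {"font-weight": "normal", "color": "gray"}}
attrs = {"class": "quiet", "style": "color: black"}

element_style(attrs, style_def, parent)
# -> {'font-weight': 'normal', 'color': 'black'}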
    +
    +def google_list_style(style: Dict[str, str]) -> str:
    +    """
    +    Finds out whether this is an ordered or unordered list
    +
    +    :type style: dict
    +
    +    :rtype: str
    +    """
    +    if "list-style-type" in style:
    +        list_style = style["list-style-type"]
    +        if list_style in ["disc", "circle", "square", "none"]:
    +            return "ul"
    +
    +    return "ol"
    +
    +
    +def google_has_height(style: Dict[str, str]) -> bool:
    +    """
    +    Check if the style of the element has the 'height' attribute
    +    explicitly defined
    +
    +    :type style: dict
    +
    +    :rtype: bool
    +    """
    +    return "height" in style
    +
    +
    +def google_text_emphasis(style: Dict[str, str]) -> List[str]:
    +    """
    +    :type style: dict
    +
    +    :returns: A list of all emphasis modifiers of the element
    +    :rtype: list
    +    """
    +    emphasis = []
    +    if "text-decoration" in style:
    +        emphasis.append(style["text-decoration"])
    +    if "font-style" in style:
    +        emphasis.append(style["font-style"])
    +    if "font-weight" in style:
    +        emphasis.append(style["font-weight"])
    +
    +    return emphasis
    +
    +
    +def google_fixed_width_font(style: Dict[str, str]) -> bool:
    +    """
    +    Check if the css of the current element defines a fixed width font
    +
    +    :type style: dict
    +
    +    :rtype: bool
    +    """
    +    font_family = ""
    +    if "font-family" in style:
    +        font_family = style["font-family"]
    +    return "courier new" == font_family or "consolas" == font_family
    +
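Taken together, the google_* helpers classify an element's computed style dict (as produced by element_style() above). A few illustrative calls:

# Illustrative only.
from crawl4ai.html2text.utils import (
    google_text_emphasis, google_fixed_width_font, google_has_height, google_list_style,
)

style = {"text-decoration": "line-through", "font-weight": "700", "font-family": "consolas"}

google_text_emphasis(style)                        # -> ['line-through', '700']
google_fixed_width_font(style)                     # -> True
google_has_height({"height": "12pt"})              # -> True
google_list_style({"list-style-type": "disc"})     # -> 'ul'
google_list_style({"list-style-type": "decimal"})  # -> 'ol'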
    +
    +def list_numbering_start(attrs: Dict[str, Optional[str]]) -> int:
    +    """
    +    Extract numbering from list element attributes
    +
    +    :type attrs: dict
    +
    +    :rtype: int
    +    """
    +    if "start" in attrs:
    +        assert attrs["start"] is not None
    +        try:
    +            return int(attrs["start"]) - 1
    +        except ValueError:
    +            pass
    +
    +    return 0
    +
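list_numbering_start() returns the zero-based offset taken from an ordered list's start attribute, so the converter's internal counter lands on the requested number when it is next incremented; for example:

# Illustrative only.
from crawl4ai.html2text.utils import list_numbering_start

list_numbering_start({"start": "4"})     # -> 3 (first item then renders as 4)
list_numbering_start({"start": "oops"})  # -> 0 (non-numeric values fall back to 0)
list_numbering_start({})                 # -> 0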
    +
    +def skipwrap(
    +    para: str, wrap_links: bool, wrap_list_items: bool, wrap_tables: bool
    +) -> bool:
    +    # If it appears to contain a link
    +    # don't wrap
    +    if not wrap_links and config.RE_LINK.search(para):
    +        return True
    +    # If the text begins with four spaces or one tab, it's a code block;
    +    # don't wrap
    +    if para[0:4] == "    " or para[0] == "\t":
    +        return True
    +
    +    # If the text begins with only two "--", possibly preceded by
    +    # whitespace, that's an emdash; so wrap.
    +    stripped = para.lstrip()
    +    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
    +        return False
    +
    +    # I'm not sure what this is for; I thought it was to detect lists,
    +    # but there's a 
    -inside- case in one of the tests that + # also depends upon it. + if stripped[0:1] in ("-", "*") and not stripped[0:2] == "**": + return not wrap_list_items + + # If text contains a pipe character it is likely a table + if not wrap_tables and config.RE_TABLE.search(para): + return True + + # If the text begins with a single -, *, or +, followed by a space, + # or an integer, followed by a ., followed by a space (in either + # case optionally proceeded by whitespace), it's a list; don't wrap. + return bool( + config.RE_ORDERED_LIST_MATCHER.match(stripped) + or config.RE_UNORDERED_LIST_MATCHER.match(stripped) + ) + + +def escape_md(text: str) -> str: + """ + Escapes markdown-sensitive characters within other markdown + constructs. + """ + return config.RE_MD_CHARS_MATCHER.sub(r"\\\1", text) + + +def escape_md_section( + text: str, + escape_backslash: bool = True, + snob: bool = False, + escape_dot: bool = True, + escape_plus: bool = True, + escape_dash: bool = True +) -> str: + """ + Escapes markdown-sensitive characters across whole document sections. + Each escaping operation can be controlled individually. + """ + if escape_backslash: + text = config.RE_MD_BACKSLASH_MATCHER.sub(r"\\\1", text) + + if snob: + text = config.RE_MD_CHARS_MATCHER_ALL.sub(r"\\\1", text) + + if escape_dot: + text = config.RE_MD_DOT_MATCHER.sub(r"\1\\\2", text) + + if escape_plus: + text = config.RE_MD_PLUS_MATCHER.sub(r"\1\\\2", text) + + if escape_dash: + text = config.RE_MD_DASH_MATCHER.sub(r"\1\\\2", text) + + return text + +def reformat_table(lines: List[str], right_margin: int) -> List[str]: + """ + Given the lines of a table + padds the cells and returns the new lines + """ + # find the maximum width of the columns + max_width = [len(x.rstrip()) + right_margin for x in lines[0].split("|")] + max_cols = len(max_width) + for line in lines: + cols = [x.rstrip() for x in line.split("|")] + num_cols = len(cols) + + # don't drop any data if colspan attributes result in unequal lengths + if num_cols < max_cols: + cols += [""] * (max_cols - num_cols) + elif max_cols < num_cols: + max_width += [len(x) + right_margin for x in cols[-(num_cols - max_cols) :]] + max_cols = num_cols + + max_width = [ + max(len(x) + right_margin, old_len) for x, old_len in zip(cols, max_width) + ] + + # reformat + new_lines = [] + for line in lines: + cols = [x.rstrip() for x in line.split("|")] + if set(line.strip()) == set("-|"): + filler = "-" + new_cols = [ + x.rstrip() + (filler * (M - len(x.rstrip()))) + for x, M in zip(cols, max_width) + ] + new_lines.append("|-" + "|".join(new_cols) + "|") + else: + filler = " " + new_cols = [ + x.rstrip() + (filler * (M - len(x.rstrip()))) + for x, M in zip(cols, max_width) + ] + new_lines.append("| " + "|".join(new_cols) + "|") + return new_lines + + +def pad_tables_in_text(text: str, right_margin: int = 1) -> str: + """ + Provide padding for tables in the text + """ + lines = text.split("\n") + table_buffer = [] # type: List[str] + table_started = False + new_lines = [] + for line in lines: + # Toggle table started + if config.TABLE_MARKER_FOR_PAD in line: + table_started = not table_started + if not table_started: + table = reformat_table(table_buffer, right_margin) + new_lines.extend(table) + table_buffer = [] + new_lines.append("") + continue + # Process lines + if table_started: + table_buffer.append(line) + else: + new_lines.append(line) + return "\n".join(new_lines) diff --git a/crawl4ai/install.py b/crawl4ai/install.py new file mode 100644 index 
0000000000000000000000000000000000000000..7efb6800b1d7eb9a9edf5fea639f92c982fbe7b8 --- /dev/null +++ b/crawl4ai/install.py @@ -0,0 +1,83 @@ +import subprocess +import sys +import asyncio +from .async_logger import AsyncLogger, LogLevel + +# Initialize logger +logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True) + +def post_install(): + """Run all post-installation tasks""" + logger.info("Running post-installation setup...", tag="INIT") + install_playwright() + run_migration() + logger.success("Post-installation setup completed!", tag="COMPLETE") + +def install_playwright(): + logger.info("Installing Playwright browsers...", tag="INIT") + try: + # subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "--force", "chrome"]) + subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "--force", "chromium"]) + logger.success("Playwright installation completed successfully.", tag="COMPLETE") + except subprocess.CalledProcessError as e: + # logger.error(f"Error during Playwright installation: {e}", tag="ERROR") + logger.warning(f"Please run '{sys.executable} -m playwright install --with-deps' manually after the installation.") + except Exception as e: + # logger.error(f"Unexpected error during Playwright installation: {e}", tag="ERROR") + logger.warning(f"Please run '{sys.executable} -m playwright install --with-deps' manually after the installation.") + +def run_migration(): + """Initialize database during installation""" + try: + logger.info("Starting database initialization...", tag="INIT") + from crawl4ai.async_database import async_db_manager + + asyncio.run(async_db_manager.initialize()) + logger.success("Database initialization completed successfully.", tag="COMPLETE") + except ImportError: + logger.warning("Database module not found. 
Will initialize on first use.") + except Exception as e: + logger.warning(f"Database initialization failed: {e}") + logger.warning("Database will be initialized on first use") + +async def run_doctor(): + """Test if Crawl4AI is working properly""" + logger.info("Running Crawl4AI health check...", tag="INIT") + try: + from .async_webcrawler import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode + + browser_config = BrowserConfig( + headless=True, + browser_type="chromium", + ignore_https_errors=True, + light_mode=True, + viewport_width=1280, + viewport_height=720 + ) + + run_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + screenshot=True, + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + logger.info("Testing crawling capabilities...", tag="TEST") + result = await crawler.arun( + url="https://crawl4ai.com", + config=run_config + ) + + if result and result.markdown: + logger.success("✅ Crawling test passed!", tag="COMPLETE") + return True + else: + raise Exception("Failed to get content") + + except Exception as e: + logger.error(f"❌ Test failed: {e}", tag="ERROR") + return False + +def doctor(): + """Entry point for the doctor command""" + import asyncio + return asyncio.run(run_doctor()) diff --git a/crawl4ai/js_snippet/__init__.py b/crawl4ai/js_snippet/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..73b0c2dd343d45483a5783ecc8d05fa459af9cd6 --- /dev/null +++ b/crawl4ai/js_snippet/__init__.py @@ -0,0 +1,15 @@ +import os, sys + +# Create a function get name of a js script, then load from the CURRENT folder of this script and return its content as string, make sure its error free +def load_js_script(script_name): + # Get the path of the current script + current_script_path = os.path.dirname(os.path.realpath(__file__)) + # Get the path of the script to load + script_path = os.path.join(current_script_path, script_name + '.js') + # Check if the script exists + if not os.path.exists(script_path): + raise ValueError(f"Script {script_name} not found in the folder {current_script_path}") + # Load the content of the script + with open(script_path, 'r') as f: + script_content = f.read() + return script_content diff --git a/crawl4ai/js_snippet/navigator_overrider.js b/crawl4ai/js_snippet/navigator_overrider.js new file mode 100644 index 0000000000000000000000000000000000000000..f341ceeb743bfaea669a7bdf378844586f52c5f2 --- /dev/null +++ b/crawl4ai/js_snippet/navigator_overrider.js @@ -0,0 +1,25 @@ +// Pass the Permissions Test. +const originalQuery = window.navigator.permissions.query; +window.navigator.permissions.query = (parameters) => + parameters.name === "notifications" + ? 
Promise.resolve({ state: Notification.permission }) + : originalQuery(parameters); +Object.defineProperty(navigator, "webdriver", { + get: () => undefined, +}); +window.navigator.chrome = { + runtime: {}, + // Add other properties if necessary +}; +Object.defineProperty(navigator, "plugins", { + get: () => [1, 2, 3, 4, 5], +}); +Object.defineProperty(navigator, "languages", { + get: () => ["en-US", "en"], +}); +Object.defineProperty(document, "hidden", { + get: () => false, +}); +Object.defineProperty(document, "visibilityState", { + get: () => "visible", +}); diff --git a/crawl4ai/js_snippet/remove_overlay_elements.js b/crawl4ai/js_snippet/remove_overlay_elements.js new file mode 100644 index 0000000000000000000000000000000000000000..0400d89c40a9c0206ddf6d8110d0c2939a29af8e --- /dev/null +++ b/crawl4ai/js_snippet/remove_overlay_elements.js @@ -0,0 +1,119 @@ +async () => { + // Function to check if element is visible + const isVisible = (elem) => { + const style = window.getComputedStyle(elem); + return style.display !== "none" && style.visibility !== "hidden" && style.opacity !== "0"; + }; + + // Common selectors for popups and overlays + const commonSelectors = [ + // Close buttons first + 'button[class*="close" i]', + 'button[class*="dismiss" i]', + 'button[aria-label*="close" i]', + 'button[title*="close" i]', + 'a[class*="close" i]', + 'span[class*="close" i]', + + // Cookie notices + '[class*="cookie-banner" i]', + '[id*="cookie-banner" i]', + '[class*="cookie-consent" i]', + '[id*="cookie-consent" i]', + + // Newsletter/subscription dialogs + '[class*="newsletter" i]', + '[class*="subscribe" i]', + + // Generic popups/modals + '[class*="popup" i]', + '[class*="modal" i]', + '[class*="overlay" i]', + '[class*="dialog" i]', + '[role="dialog"]', + '[role="alertdialog"]', + ]; + + // Try to click close buttons first + for (const selector of commonSelectors.slice(0, 6)) { + const closeButtons = document.querySelectorAll(selector); + for (const button of closeButtons) { + if (isVisible(button)) { + try { + button.click(); + await new Promise((resolve) => setTimeout(resolve, 100)); + } catch (e) { + console.log("Error clicking button:", e); + } + } + } + } + + // Remove remaining overlay elements + const removeOverlays = () => { + // Find elements with high z-index + const allElements = document.querySelectorAll("*"); + for (const elem of allElements) { + const style = window.getComputedStyle(elem); + const zIndex = parseInt(style.zIndex); + const position = style.position; + + if ( + isVisible(elem) && + (zIndex > 999 || position === "fixed" || position === "absolute") && + (elem.offsetWidth > window.innerWidth * 0.5 || + elem.offsetHeight > window.innerHeight * 0.5 || + style.backgroundColor.includes("rgba") || + parseFloat(style.opacity) < 1) + ) { + elem.remove(); + } + } + + // Remove elements matching common selectors + for (const selector of commonSelectors) { + const elements = document.querySelectorAll(selector); + elements.forEach((elem) => { + if (isVisible(elem)) { + elem.remove(); + } + }); + } + }; + + // Remove overlay elements + removeOverlays(); + + // Remove any fixed/sticky position elements at the top/bottom + const removeFixedElements = () => { + const elements = document.querySelectorAll("*"); + elements.forEach((elem) => { + const style = window.getComputedStyle(elem); + if ((style.position === "fixed" || style.position === "sticky") && isVisible(elem)) { + elem.remove(); + } + }); + }; + + removeFixedElements(); + + // Remove empty block elements as: div, p, span, 
etc. + const removeEmptyBlockElements = () => { + const blockElements = document.querySelectorAll( + "div, p, span, section, article, header, footer, aside, nav, main, ul, ol, li, dl, dt, dd, h1, h2, h3, h4, h5, h6" + ); + blockElements.forEach((elem) => { + if (elem.innerText.trim() === "") { + elem.remove(); + } + }); + }; + + // Remove margin-right and padding-right from body (often added by modal scripts) + document.body.style.marginRight = "0px"; + document.body.style.paddingRight = "0px"; + document.body.style.overflow = "auto"; + + // Wait a bit for any animations to complete + await new Promise((resolve) => setTimeout(resolve, 100)); +}; diff --git a/crawl4ai/js_snippet/update_image_dimensions.js b/crawl4ai/js_snippet/update_image_dimensions.js new file mode 100644 index 0000000000000000000000000000000000000000..709a35d5143227718ef2a5c29385f1346af4de40 --- /dev/null +++ b/crawl4ai/js_snippet/update_image_dimensions.js @@ -0,0 +1,54 @@ +() => { + return new Promise((resolve) => { + const filterImage = (img) => { + // Filter out images that are too small + if (img.width < 100 && img.height < 100) return false; + + // Filter out images that are not visible + const rect = img.getBoundingClientRect(); + if (rect.width === 0 || rect.height === 0) return false; + + // Filter out images with certain class names (e.g., icons, thumbnails) + if (img.classList.contains("icon") || img.classList.contains("thumbnail")) return false; + + // Filter out images with certain patterns in their src (e.g., placeholder images) + if (img.src.includes("placeholder") || img.src.includes("icon")) return false; + + return true; + }; + + const images = Array.from(document.querySelectorAll("img")).filter(filterImage); + let imagesLeft = images.length; + + if (imagesLeft === 0) { + resolve(); + return; + } + + const checkImage = (img) => { + if (img.complete && img.naturalWidth !== 0) { + img.setAttribute("width", img.naturalWidth); + img.setAttribute("height", img.naturalHeight); + imagesLeft--; + if (imagesLeft === 0) resolve(); + } + }; + + images.forEach((img) => { + checkImage(img); + if (!img.complete) { + img.onload = () => { + checkImage(img); + }; + img.onerror = () => { + imagesLeft--; + if (imagesLeft === 0) resolve(); + }; + } + }); + + // Fallback timeout of 5 seconds + // setTimeout(() => resolve(), 5000); + resolve(); + }); +}; diff --git a/crawl4ai/llmtxt.py b/crawl4ai/llmtxt.py new file mode 100644 index 0000000000000000000000000000000000000000..94efe0767995af580e2f75c9b6a13f4be0f8d811 --- /dev/null +++ b/crawl4ai/llmtxt.py @@ -0,0 +1,498 @@ +import os +from pathlib import Path +import re +from typing import Dict, List, Tuple, Optional, Any +import json +from tqdm import tqdm +import time +import psutil +import numpy as np +from rank_bm25 import BM25Okapi +from nltk.tokenize import word_tokenize +from nltk.corpus import stopwords +from nltk.stem import WordNetLemmatizer +from litellm import completion, batch_completion +from .async_logger import AsyncLogger +import litellm +import pickle +import hashlib # <--- ADDED for file-hash +from fnmatch import fnmatch +import glob + +litellm.set_verbose = False + +def _compute_file_hash(file_path: Path) -> str: + """Compute MD5 hash for the file's entire content.""" + hash_md5 = hashlib.md5() + with file_path.open("rb") as f: + for chunk in iter(lambda: f.read(4096), b""): + hash_md5.update(chunk) + return hash_md5.hexdigest() + +class AsyncLLMTextManager: + def __init__( + self, + docs_dir: Path, + logger: Optional[AsyncLogger] = None, + 
max_concurrent_calls: int = 5, + batch_size: int = 3 + ) -> None: + self.docs_dir = docs_dir + self.logger = logger + self.max_concurrent_calls = max_concurrent_calls + self.batch_size = batch_size + self.bm25_index = None + self.document_map: Dict[str, Any] = {} + self.tokenized_facts: List[str] = [] + self.bm25_index_file = self.docs_dir / "bm25_index.pkl" + + async def _process_document_batch(self, doc_batch: List[Path]) -> None: + """Process a batch of documents in parallel""" + contents = [] + for file_path in doc_batch: + try: + with open(file_path, 'r', encoding='utf-8') as f: + contents.append(f.read()) + except Exception as e: + self.logger.error(f"Error reading {file_path}: {str(e)}") + contents.append("") # Add empty content to maintain batch alignment + + prompt = """Given a documentation file, generate a list of atomic facts where each fact: +1. Represents a single piece of knowledge +2. Contains variations in terminology for the same concept +3. References relevant code patterns if they exist +4. Is written in a way that would match natural language queries + +Each fact should follow this format: +: | | + +Example Facts: +browser_config: Configure headless mode and browser type for AsyncWebCrawler | headless, browser_type, chromium, firefox | BrowserConfig(browser_type="chromium", headless=True) +redis_connection: Redis client connection requires host and port configuration | redis setup, redis client, connection params | Redis(host='localhost', port=6379, db=0) +pandas_filtering: Filter DataFrame rows using boolean conditions | dataframe filter, query, boolean indexing | df[df['column'] > 5] + +Wrap your response in ... tags. +""" + + # Prepare messages for batch processing + messages_list = [ + [ + {"role": "user", "content": f"{prompt}\n\nGenerate index for this documentation:\n\n{content}"} + ] + for content in contents if content + ] + + try: + responses = batch_completion( + model="anthropic/claude-3-5-sonnet-latest", + messages=messages_list, + logger_fn=None + ) + + # Process responses and save index files + for response, file_path in zip(responses, doc_batch): + try: + index_content_match = re.search( + r'(.*?)', + response.choices[0].message.content, + re.DOTALL + ) + if not index_content_match: + self.logger.warning(f"No ... content found for {file_path}") + continue + + index_content = re.sub( + r"\n\s*\n", "\n", index_content_match.group(1) + ).strip() + if index_content: + index_file = file_path.with_suffix('.q.md') + with open(index_file, 'w', encoding='utf-8') as f: + f.write(index_content) + self.logger.info(f"Created index file: {index_file}") + else: + self.logger.warning(f"No index content found in response for {file_path}") + + except Exception as e: + self.logger.error(f"Error processing response for {file_path}: {str(e)}") + + except Exception as e: + self.logger.error(f"Error in batch completion: {str(e)}") + + def _validate_fact_line(self, line: str) -> Tuple[bool, Optional[str]]: + if "|" not in line: + return False, "Missing separator '|'" + + parts = [p.strip() for p in line.split("|")] + if len(parts) != 3: + return False, f"Expected 3 parts, got {len(parts)}" + + concept_part = parts[0] + if ":" not in concept_part: + return False, "Missing ':' in concept definition" + + return True, None + + def _load_or_create_token_cache(self, fact_file: Path) -> Dict: + """ + Load token cache from .q.tokens if present and matching file hash. + Otherwise return a new structure with updated file-hash. 
+ """ + cache_file = fact_file.with_suffix(".q.tokens") + current_hash = _compute_file_hash(fact_file) + + if cache_file.exists(): + try: + with open(cache_file, "r") as f: + cache = json.load(f) + # If the hash matches, return it directly + if cache.get("content_hash") == current_hash: + return cache + # Otherwise, we signal that it's changed + self.logger.info(f"Hash changed for {fact_file}, reindex needed.") + except json.JSONDecodeError: + self.logger.warning(f"Corrupt token cache for {fact_file}, rebuilding.") + except Exception as e: + self.logger.warning(f"Error reading cache for {fact_file}: {str(e)}") + + # Return a fresh cache + return {"facts": {}, "content_hash": current_hash} + + def _save_token_cache(self, fact_file: Path, cache: Dict) -> None: + cache_file = fact_file.with_suffix(".q.tokens") + # Always ensure we're saving the correct file-hash + cache["content_hash"] = _compute_file_hash(fact_file) + with open(cache_file, "w") as f: + json.dump(cache, f) + + def preprocess_text(self, text: str) -> List[str]: + parts = [x.strip() for x in text.split("|")] if "|" in text else [text] + # Remove : after the first word of parts[0] + parts[0] = re.sub(r"^(.*?):", r"\1", parts[0]) + + lemmatizer = WordNetLemmatizer() + stop_words = set(stopwords.words("english")) - { + "how", "what", "when", "where", "why", "which", + } + + tokens = [] + for part in parts: + if "(" in part and ")" in part: + code_tokens = re.findall( + r'[\w_]+(?=\()|[\w_]+(?==[\'"]{1}[\w_]+[\'"]{1})', part + ) + tokens.extend(code_tokens) + + words = word_tokenize(part.lower()) + tokens.extend( + [ + lemmatizer.lemmatize(token) + for token in words + if token not in stop_words + ] + ) + + return tokens + + def maybe_load_bm25_index(self, clear_cache=False) -> bool: + """ + Load existing BM25 index from disk, if present and clear_cache=False. + """ + if not clear_cache and os.path.exists(self.bm25_index_file): + self.logger.info("Loading existing BM25 index from disk.") + with open(self.bm25_index_file, "rb") as f: + data = pickle.load(f) + self.tokenized_facts = data["tokenized_facts"] + self.bm25_index = data["bm25_index"] + return True + return False + + def build_search_index(self, clear_cache=False) -> None: + """ + Checks for new or modified .q.md files by comparing file-hash. + If none need reindexing and clear_cache is False, loads existing index if available. + Otherwise, reindexes only changed/new files and merges or creates a new index. + """ + # If clear_cache is True, we skip partial logic: rebuild everything from scratch + if clear_cache: + self.logger.info("Clearing cache and rebuilding full search index.") + if self.bm25_index_file.exists(): + self.bm25_index_file.unlink() + + process = psutil.Process() + self.logger.info("Checking which .q.md files need (re)indexing...") + + # Gather all .q.md files + q_files = [self.docs_dir / f for f in os.listdir(self.docs_dir) if f.endswith(".q.md")] + + # We'll store known (unchanged) facts in these lists + existing_facts: List[str] = [] + existing_tokens: List[List[str]] = [] + + # Keep track of invalid lines for logging + invalid_lines = [] + needSet = [] # files that must be (re)indexed + + for qf in q_files: + token_cache_file = qf.with_suffix(".q.tokens") + + # If no .q.tokens or clear_cache is True → definitely reindex + if clear_cache or not token_cache_file.exists(): + needSet.append(qf) + continue + + # Otherwise, load the existing cache and compare hash + cache = self._load_or_create_token_cache(qf) + # If the .q.tokens was out of date (i.e. 
changed hash), we reindex + if len(cache["facts"]) == 0 or cache.get("content_hash") != _compute_file_hash(qf): + needSet.append(qf) + else: + # File is unchanged → retrieve cached token data + for line, cache_data in cache["facts"].items(): + existing_facts.append(line) + existing_tokens.append(cache_data["tokens"]) + self.document_map[line] = qf # track the doc for that fact + + if not needSet and not clear_cache: + # If no file needs reindexing, try loading existing index + if self.maybe_load_bm25_index(clear_cache=False): + self.logger.info("No new/changed .q.md files found. Using existing BM25 index.") + return + else: + # If there's no existing index, we must build a fresh index from the old caches + self.logger.info("No existing BM25 index found. Building from cached facts.") + if existing_facts: + self.logger.info(f"Building BM25 index with {len(existing_facts)} cached facts.") + self.bm25_index = BM25Okapi(existing_tokens) + self.tokenized_facts = existing_facts + with open(self.bm25_index_file, "wb") as f: + pickle.dump({ + "bm25_index": self.bm25_index, + "tokenized_facts": self.tokenized_facts + }, f) + else: + self.logger.warning("No facts found at all. Index remains empty.") + return + + # ----------------------------------------------------- /Users/unclecode/.crawl4ai/docs/14_proxy_security.q.q.tokens '/Users/unclecode/.crawl4ai/docs/14_proxy_security.q.md' + # If we reach here, we have new or changed .q.md files + # We'll parse them, reindex them, and then combine with existing_facts + # ----------------------------------------------------- + + self.logger.info(f"{len(needSet)} file(s) need reindexing. Parsing now...") + + # 1) Parse the new or changed .q.md files + new_facts = [] + new_tokens = [] + with tqdm(total=len(needSet), desc="Indexing changed files") as file_pbar: + for file in needSet: + # We'll build up a fresh cache + fresh_cache = {"facts": {}, "content_hash": _compute_file_hash(file)} + try: + with open(file, "r", encoding="utf-8") as f_obj: + content = f_obj.read().strip() + lines = [l.strip() for l in content.split("\n") if l.strip()] + + for line in lines: + is_valid, error = self._validate_fact_line(line) + if not is_valid: + invalid_lines.append((file, line, error)) + continue + + tokens = self.preprocess_text(line) + fresh_cache["facts"][line] = { + "tokens": tokens, + "added": time.time(), + } + new_facts.append(line) + new_tokens.append(tokens) + self.document_map[line] = file + + # Save the new .q.tokens with updated hash + self._save_token_cache(file, fresh_cache) + + mem_usage = process.memory_info().rss / 1024 / 1024 + self.logger.debug(f"Memory usage after {file.name}: {mem_usage:.2f}MB") + + except Exception as e: + self.logger.error(f"Error processing {file}: {str(e)}") + + file_pbar.update(1) + + if invalid_lines: + self.logger.warning(f"Found {len(invalid_lines)} invalid fact lines:") + for file, line, error in invalid_lines: + self.logger.warning(f"{file}: {error} in line: {line[:50]}...") + + # 2) Merge newly tokenized facts with the existing ones + all_facts = existing_facts + new_facts + all_tokens = existing_tokens + new_tokens + + # 3) Build BM25 index from combined facts + self.logger.info(f"Building BM25 index with {len(all_facts)} total facts (old + new).") + self.bm25_index = BM25Okapi(all_tokens) + self.tokenized_facts = all_facts + + # 4) Save the updated BM25 index to disk + with open(self.bm25_index_file, "wb") as f: + pickle.dump({ + "bm25_index": self.bm25_index, + "tokenized_facts": self.tokenized_facts + }, f) + + final_mem 
= process.memory_info().rss / 1024 / 1024 + self.logger.info(f"Search index updated. Final memory usage: {final_mem:.2f}MB") + + async def generate_index_files(self, force_generate_facts: bool = False, clear_bm25_cache: bool = False) -> None: + """ + Generate index files for all documents in parallel batches + + Args: + force_generate_facts (bool): If True, regenerate indexes even if they exist + clear_bm25_cache (bool): If True, clear existing BM25 index cache + """ + self.logger.info("Starting index generation for documentation files.") + + md_files = [ + self.docs_dir / f for f in os.listdir(self.docs_dir) + if f.endswith('.md') and not any(f.endswith(x) for x in ['.q.md', '.xs.md']) + ] + + # Filter out files that already have .q files unless force=True + if not force_generate_facts: + md_files = [ + f for f in md_files + if not (self.docs_dir / f.name.replace('.md', '.q.md')).exists() + ] + + if not md_files: + self.logger.info("All index files exist. Use force=True to regenerate.") + else: + # Process documents in batches + for i in range(0, len(md_files), self.batch_size): + batch = md_files[i:i + self.batch_size] + self.logger.info(f"Processing batch {i//self.batch_size + 1}/{(len(md_files)//self.batch_size) + 1}") + await self._process_document_batch(batch) + + self.logger.info("Index generation complete, building/updating search index.") + self.build_search_index(clear_cache=clear_bm25_cache) + + def generate(self, sections: List[str], mode: str = "extended") -> str: + # Get all markdown files + all_files = glob.glob(str(self.docs_dir / "[0-9]*.md")) + \ + glob.glob(str(self.docs_dir / "[0-9]*.xs.md")) + + # Extract base names without extensions + base_docs = {Path(f).name.split('.')[0] for f in all_files + if not Path(f).name.endswith('.q.md')} + + # Filter by sections if provided + if sections: + base_docs = {doc for doc in base_docs + if any(section.lower() in doc.lower() for section in sections)} + + # Get file paths based on mode + files = [] + for doc in sorted(base_docs, key=lambda x: int(x.split('_')[0]) if x.split('_')[0].isdigit() else 999999): + if mode == "condensed": + xs_file = self.docs_dir / f"{doc}.xs.md" + regular_file = self.docs_dir / f"{doc}.md" + files.append(str(xs_file if xs_file.exists() else regular_file)) + else: + files.append(str(self.docs_dir / f"{doc}.md")) + + # Read and format content + content = [] + for file in files: + try: + with open(file, 'r', encoding='utf-8') as f: + fname = Path(file).name + content.append(f"{'#'*20}\n# {fname}\n{'#'*20}\n\n{f.read()}") + except Exception as e: + self.logger.error(f"Error reading {file}: {str(e)}") + + return "\n\n---\n\n".join(content) if content else "" + + def search(self, query: str, top_k: int = 5) -> str: + if not self.bm25_index: + return "No search index available. Call build_search_index() first." 
+ + query_tokens = self.preprocess_text(query) + doc_scores = self.bm25_index.get_scores(query_tokens) + + mean_score = np.mean(doc_scores) + std_score = np.std(doc_scores) + score_threshold = mean_score + (0.25 * std_score) + + file_data = self._aggregate_search_scores( + doc_scores=doc_scores, + score_threshold=score_threshold, + query_tokens=query_tokens, + ) + + ranked_files = sorted( + file_data.items(), + key=lambda x: ( + x[1]["code_match_score"] * 2.0 + + x[1]["match_count"] * 1.5 + + x[1]["total_score"] + ), + reverse=True, + )[:top_k] + + results = [] + for file, _ in ranked_files: + main_doc = str(file).replace(".q.md", ".md") + if os.path.exists(self.docs_dir / main_doc): + with open(self.docs_dir / main_doc, "r", encoding='utf-8') as f: + only_file_name = main_doc.split("/")[-1] + content = [ + "#" * 20, + f"# {only_file_name}", + "#" * 20, + "", + f.read() + ] + results.append("\n".join(content)) + + return "\n\n---\n\n".join(results) + + def _aggregate_search_scores( + self, doc_scores: List[float], score_threshold: float, query_tokens: List[str] + ) -> Dict: + file_data = {} + + for idx, score in enumerate(doc_scores): + if score <= score_threshold: + continue + + fact = self.tokenized_facts[idx] + file_path = self.document_map[fact] + + if file_path not in file_data: + file_data[file_path] = { + "total_score": 0, + "match_count": 0, + "code_match_score": 0, + "matched_facts": [], + } + + components = fact.split("|") if "|" in fact else [fact] + + code_match_score = 0 + if len(components) == 3: + code_ref = components[2].strip() + code_tokens = self.preprocess_text(code_ref) + code_match_score = len(set(query_tokens) & set(code_tokens)) / len(query_tokens) + + file_data[file_path]["total_score"] += score + file_data[file_path]["match_count"] += 1 + file_data[file_path]["code_match_score"] = max( + file_data[file_path]["code_match_score"], code_match_score + ) + file_data[file_path]["matched_facts"].append(fact) + + return file_data + + def refresh_index(self) -> None: + """Convenience method for a full rebuild.""" + self.build_search_index(clear_cache=True) diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py new file mode 100644 index 0000000000000000000000000000000000000000..89e5e34e624c7212cffe1e19cb4e00bbcf2bfa5f --- /dev/null +++ b/crawl4ai/markdown_generation_strategy.py @@ -0,0 +1,225 @@ +from abc import ABC, abstractmethod +from typing import Optional, Dict, Any, Tuple +from .models import MarkdownGenerationResult +from .html2text import CustomHTML2Text +from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter +import re +from urllib.parse import urljoin + +# Pre-compile the regex pattern +LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)') + +def fast_urljoin(base: str, url: str) -> str: + """Fast URL joining for common cases.""" + if url.startswith(('http://', 'https://', 'mailto:', '//')): + return url + if url.startswith('/'): + # Handle absolute paths + if base.endswith('/'): + return base[:-1] + url + return base + url + return urljoin(base, url) + +class MarkdownGenerationStrategy(ABC): + """Abstract base class for markdown generation strategies.""" + def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None): + self.content_filter = content_filter + self.options = options or {} + + @abstractmethod + def generate_markdown(self, + cleaned_html: str, + base_url: str = "", + html2text_options: Optional[Dict[str, Any]] = 
None, + content_filter: Optional[RelevantContentFilter] = None, + citations: bool = True, + **kwargs) -> MarkdownGenerationResult: + """Generate markdown from cleaned HTML.""" + pass + +class DefaultMarkdownGenerator(MarkdownGenerationStrategy): + """ + Default implementation of markdown generation strategy. + + How it works: + 1. Generate raw markdown from cleaned HTML. + 2. Convert links to citations. + 3. Generate fit markdown if content filter is provided. + 4. Return MarkdownGenerationResult. + + Args: + content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown. + options (Optional[Dict[str, Any]]): Additional options for markdown generation. Defaults to None. + + Returns: + MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown. + """ + def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None): + super().__init__(content_filter, options) + + def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]: + """ + Convert links in markdown to citations. + + How it works: + 1. Find all links in the markdown. + 2. Convert links to citations. + 3. Return converted markdown and references markdown. + + Note: + This function uses a regex pattern to find links in markdown. + + Args: + markdown (str): Markdown text. + base_url (str): Base URL for URL joins. + + Returns: + Tuple[str, str]: Converted markdown and references markdown. + """ + link_map = {} + url_cache = {} # Cache for URL joins + parts = [] + last_end = 0 + counter = 1 + + for match in LINK_PATTERN.finditer(markdown): + parts.append(markdown[last_end:match.start()]) + text, url, title = match.groups() + + # Use cached URL if available, otherwise compute and cache + if base_url and not url.startswith(('http://', 'https://', 'mailto:')): + if url not in url_cache: + url_cache[url] = fast_urljoin(base_url, url) + url = url_cache[url] + + if url not in link_map: + desc = [] + if title: desc.append(title) + if text and text != title: desc.append(text) + link_map[url] = (counter, ": " + " - ".join(desc) if desc else "") + counter += 1 + + num = link_map[url][0] + parts.append(f"{text}⟨{num}⟩" if not match.group(0).startswith('!') else f"![{text}⟨{num}⟩]") + last_end = match.end() + + parts.append(markdown[last_end:]) + converted_text = ''.join(parts) + + # Pre-build reference strings + references = ["\n\n## References\n\n"] + references.extend( + f"⟨{num}⟩ {url}{desc}\n" + for url, (num, desc) in sorted(link_map.items(), key=lambda x: x[1][0]) + ) + + return converted_text, ''.join(references) + + def generate_markdown(self, + cleaned_html: str, + base_url: str = "", + html2text_options: Optional[Dict[str, Any]] = None, + options: Optional[Dict[str, Any]] = None, + content_filter: Optional[RelevantContentFilter] = None, + citations: bool = True, + **kwargs) -> MarkdownGenerationResult: + """ + Generate markdown with citations from cleaned HTML. + + How it works: + 1. Generate raw markdown from cleaned HTML. + 2. Convert links to citations. + 3. Generate fit markdown if content filter is provided. + 4. Return MarkdownGenerationResult. + + Args: + cleaned_html (str): Cleaned HTML content. + base_url (str): Base URL for URL joins. + html2text_options (Optional[Dict[str, Any]]): HTML2Text options. + options (Optional[Dict[str, Any]]): Additional options for markdown generation. 
+ content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown. + citations (bool): Whether to generate citations. + + Returns: + MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown. + """ + try: + # Initialize HTML2Text with default options for better conversion + h = CustomHTML2Text(baseurl=base_url) + default_options = { + 'body_width': 0, # Disable text wrapping + 'ignore_emphasis': False, + 'ignore_links': False, + 'ignore_images': False, + 'protect_links': True, + 'single_line_break': True, + 'mark_code': True, + 'escape_snob': False + } + + # Update with custom options if provided + if html2text_options: + default_options.update(html2text_options) + elif options: + default_options.update(options) + elif self.options: + default_options.update(self.options) + + h.update_params(**default_options) + + # Ensure we have valid input + if not cleaned_html: + cleaned_html = "" + elif not isinstance(cleaned_html, str): + cleaned_html = str(cleaned_html) + + # Generate raw markdown + try: + raw_markdown = h.handle(cleaned_html) + except Exception as e: + raw_markdown = f"Error converting HTML to markdown: {str(e)}" + + raw_markdown = raw_markdown.replace(' ```', '```') + + # Convert links to citations + markdown_with_citations: str = raw_markdown + references_markdown: str = "" + if citations: + try: + markdown_with_citations, references_markdown = self.convert_links_to_citations( + raw_markdown, base_url + ) + except Exception as e: + markdown_with_citations = raw_markdown + references_markdown = f"Error generating citations: {str(e)}" + + # Generate fit markdown if content filter is provided + fit_markdown: Optional[str] = "" + filtered_html: Optional[str] = "" + if content_filter or self.content_filter: + try: + content_filter = content_filter or self.content_filter + filtered_html = content_filter.filter_content(cleaned_html) + filtered_html = '\n'.join('
    <div>{}</div>
    '.format(s) for s in filtered_html) + fit_markdown = h.handle(filtered_html) + except Exception as e: + fit_markdown = f"Error generating fit markdown: {str(e)}" + filtered_html = "" + + return MarkdownGenerationResult( + raw_markdown=raw_markdown or "", + markdown_with_citations=markdown_with_citations or "", + references_markdown=references_markdown or "", + fit_markdown=fit_markdown or "", + fit_html=filtered_html or "", + ) + except Exception as e: + # If anything fails, return empty strings with error message + error_msg = f"Error in markdown generation: {str(e)}" + return MarkdownGenerationResult( + raw_markdown=error_msg, + markdown_with_citations=error_msg, + references_markdown="", + fit_markdown="", + fit_html="", + ) diff --git a/crawl4ai/migrations.py b/crawl4ai/migrations.py new file mode 100644 index 0000000000000000000000000000000000000000..3386b0fb433b4ba116476ee225606eab2cb3a956 --- /dev/null +++ b/crawl4ai/migrations.py @@ -0,0 +1,168 @@ +import os +import asyncio +import logging +from pathlib import Path +import aiosqlite +from typing import Optional +import xxhash +import aiofiles +import shutil +import time +from datetime import datetime +from .async_logger import AsyncLogger, LogLevel + +# Initialize logger +logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True) + +# logging.basicConfig(level=logging.INFO) +# logger = logging.getLogger(__name__) + +class DatabaseMigration: + def __init__(self, db_path: str): + self.db_path = db_path + self.content_paths = self._ensure_content_dirs(os.path.dirname(db_path)) + + def _ensure_content_dirs(self, base_path: str) -> dict: + dirs = { + 'html': 'html_content', + 'cleaned': 'cleaned_html', + 'markdown': 'markdown_content', + 'extracted': 'extracted_content', + 'screenshots': 'screenshots' + } + content_paths = {} + for key, dirname in dirs.items(): + path = os.path.join(base_path, dirname) + os.makedirs(path, exist_ok=True) + content_paths[key] = path + return content_paths + + def _generate_content_hash(self, content: str) -> str: + x = xxhash.xxh64() + x.update(content.encode()) + content_hash = x.hexdigest() + return content_hash + # return hashlib.sha256(content.encode()).hexdigest() + + async def _store_content(self, content: str, content_type: str) -> str: + if not content: + return "" + + content_hash = self._generate_content_hash(content) + file_path = os.path.join(self.content_paths[content_type], content_hash) + + if not os.path.exists(file_path): + async with aiofiles.open(file_path, 'w', encoding='utf-8') as f: + await f.write(content) + + return content_hash + + async def migrate_database(self): + """Migrate existing database to file-based storage""" + # logger.info("Starting database migration...") + logger.info("Starting database migration...", tag="INIT") + + try: + async with aiosqlite.connect(self.db_path) as db: + # Get all rows + async with db.execute( + '''SELECT url, html, cleaned_html, markdown, + extracted_content, screenshot FROM crawled_data''' + ) as cursor: + rows = await cursor.fetchall() + + migrated_count = 0 + for row in rows: + url, html, cleaned_html, markdown, extracted_content, screenshot = row + + # Store content in files and get hashes + html_hash = await self._store_content(html, 'html') + cleaned_hash = await self._store_content(cleaned_html, 'cleaned') + markdown_hash = await self._store_content(markdown, 'markdown') + extracted_hash = await self._store_content(extracted_content, 'extracted') + screenshot_hash = await self._store_content(screenshot, 'screenshots') + + # Update 
database with hashes + await db.execute(''' + UPDATE crawled_data + SET html = ?, + cleaned_html = ?, + markdown = ?, + extracted_content = ?, + screenshot = ? + WHERE url = ? + ''', (html_hash, cleaned_hash, markdown_hash, + extracted_hash, screenshot_hash, url)) + + migrated_count += 1 + if migrated_count % 100 == 0: + logger.info(f"Migrated {migrated_count} records...", tag="INIT") + + + await db.commit() + logger.success(f"Migration completed. {migrated_count} records processed.", tag="COMPLETE") + + except Exception as e: + # logger.error(f"Migration failed: {e}") + logger.error( + message="Migration failed: {error}", + tag="ERROR", + params={"error": str(e)} + ) + raise e + +async def backup_database(db_path: str) -> str: + """Create backup of existing database""" + if not os.path.exists(db_path): + logger.info("No existing database found. Skipping backup.", tag="INIT") + return None + + # Create backup with timestamp + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + backup_path = f"{db_path}.backup_{timestamp}" + + try: + # Wait for any potential write operations to finish + await asyncio.sleep(1) + + # Create backup + shutil.copy2(db_path, backup_path) + logger.info(f"Database backup created at: {backup_path}", tag="COMPLETE") + return backup_path + except Exception as e: + # logger.error(f"Backup failed: {e}") + logger.error( + message="Migration failed: {error}", + tag="ERROR", + params={"error": str(e)} + ) + raise e + +async def run_migration(db_path: Optional[str] = None): + """Run database migration""" + if db_path is None: + db_path = os.path.join(Path.home(), ".crawl4ai", "crawl4ai.db") + + if not os.path.exists(db_path): + logger.info("No existing database found. Skipping migration.", tag="INIT") + return + + # Create backup first + backup_path = await backup_database(db_path) + if not backup_path: + return + + migration = DatabaseMigration(db_path) + await migration.migrate_database() + +def main(): + """CLI entry point for migration""" + import argparse + parser = argparse.ArgumentParser(description='Migrate Crawl4AI database to file-based storage') + parser.add_argument('--db-path', help='Custom database path') + args = parser.parse_args() + + asyncio.run(run_migration(args.db_path)) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/crawl4ai/model_loader.py b/crawl4ai/model_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..d1872d7e48decbb1860caf25684a6d27004f19e0 --- /dev/null +++ b/crawl4ai/model_loader.py @@ -0,0 +1,256 @@ +from functools import lru_cache +from pathlib import Path +import subprocess, os +import shutil +import tarfile +from .model_loader import * +import argparse +import urllib.request +from crawl4ai.config import MODEL_REPO_BRANCH +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) + +@lru_cache() +def get_available_memory(device): + import torch + if device.type == 'cuda': + return torch.cuda.get_device_properties(device).total_memory + elif device.type == 'mps': + return 48 * 1024 ** 3 # Assuming 8GB for MPS, as a conservative estimate + else: + return 0 + +@lru_cache() +def calculate_batch_size(device): + available_memory = get_available_memory(device) + + if device.type == 'cpu': + return 16 + elif device.type in ['cuda', 'mps']: + # Adjust these thresholds based on your model size and available memory + if available_memory >= 31 * 1024 ** 3: # > 32GB + return 256 + elif available_memory >= 15 * 1024 ** 3: # > 16GB to 32GB + return 128 + elif 
available_memory >= 8 * 1024 ** 3: # 8GB to 16GB + return 64 + else: + return 32 + else: + return 16 # Default batch size + +@lru_cache() +def get_device(): + import torch + if torch.cuda.is_available(): + device = torch.device('cuda') + elif torch.backends.mps.is_available(): + device = torch.device('mps') + else: + device = torch.device('cpu') + return device + +def set_model_device(model): + device = get_device() + model.to(device) + return model, device + +@lru_cache() +def get_home_folder(): + home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") + os.makedirs(home_folder, exist_ok=True) + os.makedirs(f"{home_folder}/cache", exist_ok=True) + os.makedirs(f"{home_folder}/models", exist_ok=True) + return home_folder + +@lru_cache() +def load_bert_base_uncased(): + from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', resume_download=None) + model = BertModel.from_pretrained('bert-base-uncased', resume_download=None) + model.eval() + model, device = set_model_device(model) + return tokenizer, model + +@lru_cache() +def load_HF_embedding_model(model_name="BAAI/bge-small-en-v1.5") -> tuple: + """Load the Hugging Face model for embedding. + + Args: + model_name (str, optional): The model name to load. Defaults to "BAAI/bge-small-en-v1.5". + + Returns: + tuple: The tokenizer and model. + """ + from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel + tokenizer = AutoTokenizer.from_pretrained(model_name, resume_download=None) + model = AutoModel.from_pretrained(model_name, resume_download=None) + model.eval() + model, device = set_model_device(model) + return tokenizer, model + +@lru_cache() +def load_text_classifier(): + from transformers import AutoTokenizer, AutoModelForSequenceClassification + from transformers import pipeline + import torch + + tokenizer = AutoTokenizer.from_pretrained("dstefa/roberta-base_topic_classification_nyt_news") + model = AutoModelForSequenceClassification.from_pretrained("dstefa/roberta-base_topic_classification_nyt_news") + model.eval() + model, device = set_model_device(model) + pipe = pipeline("text-classification", model=model, tokenizer=tokenizer) + return pipe + +@lru_cache() +def load_text_multilabel_classifier(): + from transformers import AutoModelForSequenceClassification, AutoTokenizer + import numpy as np + from scipy.special import expit + import torch + + # # Check for available device: CUDA, MPS (for Apple Silicon), or CPU + # if torch.cuda.is_available(): + # device = torch.device("cuda") + # elif torch.backends.mps.is_available(): + # device = torch.device("mps") + # else: + # device = torch.device("cpu") + # # return load_spacy_model(), torch.device("cpu") + + + MODEL = "cardiffnlp/tweet-topic-21-multi" + tokenizer = AutoTokenizer.from_pretrained(MODEL, resume_download=None) + model = AutoModelForSequenceClassification.from_pretrained(MODEL, resume_download=None) + model.eval() + model, device = set_model_device(model) + class_mapping = model.config.id2label + + def _classifier(texts, threshold=0.5, max_length=64): + tokens = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length) + tokens = {key: val.to(device) for key, val in tokens.items()} # Move tokens to the selected device + + with torch.no_grad(): + output = model(**tokens) + + scores = output.logits.detach().cpu().numpy() + scores = expit(scores) + predictions = (scores >= threshold) * 1 + + batch_labels = 
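As a quick illustration of the cached loaders above, the snippet below loads the default BGE embedding model once, pushes a batch to whatever device get_device() picked, and mean-pools the token states into sentence vectors. Mean pooling and the sample sentence are my own choices for the demo, not something the loader prescribes:

import torch
from crawl4ai.model_loader import (
    calculate_batch_size, get_device, load_HF_embedding_model,
)

tokenizer, model = load_HF_embedding_model()   # cached after the first call
device = get_device()
batch = ["Crawl4AI turns web pages into clean markdown."]
tokens = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
with torch.no_grad():
    out = model(**tokens)
embeddings = out.last_hidden_state.mean(dim=1)  # one vector per input sentence
print(embeddings.shape, "suggested batch size:", calculate_batch_size(device))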
[] + for prediction in predictions: + labels = [class_mapping[i] for i, value in enumerate(prediction) if value == 1] + batch_labels.append(labels) + + return batch_labels + + return _classifier, device + +@lru_cache() +def load_nltk_punkt(): + import nltk + try: + nltk.data.find('tokenizers/punkt') + except LookupError: + nltk.download('punkt') + return nltk.data.find('tokenizers/punkt') + +@lru_cache() +def load_spacy_model(): + import spacy + name = "models/reuters" + home_folder = get_home_folder() + model_folder = Path(home_folder) / name + + # Check if the model directory already exists + if not (model_folder.exists() and any(model_folder.iterdir())): + repo_url = "https://github.com/unclecode/crawl4ai.git" + branch = MODEL_REPO_BRANCH + repo_folder = Path(home_folder) / "crawl4ai" + + print("[LOG] ⏬ Downloading Spacy model for the first time...") + + # Remove existing repo folder if it exists + if repo_folder.exists(): + try: + shutil.rmtree(repo_folder) + if model_folder.exists(): + shutil.rmtree(model_folder) + except PermissionError: + print("[WARNING] Unable to remove existing folders. Please manually delete the following folders and try again:") + print(f"- {repo_folder}") + print(f"- {model_folder}") + return None + + try: + # Clone the repository + subprocess.run( + ["git", "clone", "-b", branch, repo_url, str(repo_folder)], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + check=True + ) + + # Create the models directory if it doesn't exist + models_folder = Path(home_folder) / "models" + models_folder.mkdir(parents=True, exist_ok=True) + + # Copy the reuters model folder to the models directory + source_folder = repo_folder / "models" / "reuters" + shutil.copytree(source_folder, model_folder) + + # Remove the cloned repository + shutil.rmtree(repo_folder) + + print("[LOG] ✅ Spacy Model downloaded successfully") + except subprocess.CalledProcessError as e: + print(f"An error occurred while cloning the repository: {e}") + return None + except Exception as e: + print(f"An error occurred: {e}") + return None + + try: + return spacy.load(str(model_folder)) + except Exception as e: + print(f"Error loading spacy model: {e}") + return None + +def download_all_models(remove_existing=False): + """Download all models required for Crawl4AI.""" + if remove_existing: + print("[LOG] Removing existing models...") + home_folder = get_home_folder() + model_folders = [ + os.path.join(home_folder, "models/reuters"), + os.path.join(home_folder, "models"), + ] + for folder in model_folders: + if Path(folder).exists(): + shutil.rmtree(folder) + print("[LOG] Existing models removed.") + + # Load each model to trigger download + # print("[LOG] Downloading BERT Base Uncased...") + # load_bert_base_uncased() + # print("[LOG] Downloading BGE Small EN v1.5...") + # load_bge_small_en_v1_5() + # print("[LOG] Downloading ONNX model...") + # load_onnx_all_MiniLM_l6_v2() + print("[LOG] Downloading text classifier...") + _, device = load_text_multilabel_classifier() + print(f"[LOG] Text classifier loaded on {device}") + print("[LOG] Downloading custom NLTK Punkt model...") + load_nltk_punkt() + print("[LOG] ✅ All models downloaded successfully.") + +def main(): + print("[LOG] Welcome to the Crawl4AI Model Downloader!") + print("[LOG] This script will download all the models required for Crawl4AI.") + parser = argparse.ArgumentParser(description="Crawl4AI Model Downloader") + parser.add_argument('--remove-existing', action='store_true', help="Remove existing models before downloading") + args = 
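The multilabel loader returns a plain callable plus the device it ended up on, so classification is a one-liner once the weights are cached. The sample sentence and the label in the comment are illustrative; actual labels come from the cardiffnlp model's id2label mapping:

from crawl4ai.model_loader import load_text_multilabel_classifier

classify, device = load_text_multilabel_classifier()
labels = classify(
    ["NASA confirms the launch window for its next lunar mission."],
    threshold=0.5,
)
print(f"device={device}, labels={labels}")  # e.g. [['science_&_technology']]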
parser.parse_args() + + download_all_models(remove_existing=args.remove_existing) + +if __name__ == "__main__": + main() diff --git a/crawl4ai/models.py b/crawl4ai/models.py new file mode 100644 index 0000000000000000000000000000000000000000..6fb362a349fae94fdf96f8f5b1518371930b6e7b --- /dev/null +++ b/crawl4ai/models.py @@ -0,0 +1,61 @@ +from pydantic import BaseModel, HttpUrl +from typing import List, Dict, Optional, Callable, Awaitable, Union, Any +from dataclasses import dataclass +from .ssl_certificate import SSLCertificate + +@dataclass +class TokenUsage: + completion_tokens: int = 0 + prompt_tokens: int = 0 + total_tokens: int = 0 + completion_tokens_details: Optional[dict] = None + prompt_tokens_details: Optional[dict] = None + + +class UrlModel(BaseModel): + url: HttpUrl + forced: bool = False + +class MarkdownGenerationResult(BaseModel): + raw_markdown: str + markdown_with_citations: str + references_markdown: str + fit_markdown: Optional[str] = None + fit_html: Optional[str] = None + +class CrawlResult(BaseModel): + url: str + html: str + success: bool + cleaned_html: Optional[str] = None + media: Dict[str, List[Dict]] = {} + links: Dict[str, List[Dict]] = {} + downloaded_files: Optional[List[str]] = None + screenshot: Optional[str] = None + pdf : Optional[bytes] = None + markdown: Optional[Union[str, MarkdownGenerationResult]] = None + markdown_v2: Optional[MarkdownGenerationResult] = None + fit_markdown: Optional[str] = None + fit_html: Optional[str] = None + extracted_content: Optional[str] = None + metadata: Optional[dict] = None + error_message: Optional[str] = None + session_id: Optional[str] = None + response_headers: Optional[dict] = None + status_code: Optional[int] = None + ssl_certificate: Optional[SSLCertificate] = None + class Config: + arbitrary_types_allowed = True + +class AsyncCrawlResponse(BaseModel): + html: str + response_headers: Dict[str, str] + status_code: int + screenshot: Optional[str] = None + pdf_data: Optional[bytes] = None + get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None + downloaded_files: Optional[List[str]] = None + ssl_certificate: Optional[SSLCertificate] = None + + class Config: + arbitrary_types_allowed = True diff --git a/crawl4ai/prompts.py b/crawl4ai/prompts.py new file mode 100644 index 0000000000000000000000000000000000000000..7a963e6d9215da02def72b686cd87abb8ebfed5b --- /dev/null +++ b/crawl4ai/prompts.py @@ -0,0 +1,204 @@ +PROMPT_EXTRACT_BLOCKS = """Here is the URL of the webpage: +{URL} + +And here is the cleaned HTML content of that webpage: + +{HTML} + + +Your task is to break down this HTML content into semantically relevant blocks, and for each block, generate a JSON object with the following keys: + +- index: an integer representing the index of the block in the content +- tags: a list of semantic tags that are relevant to the content of the block +- content: a list of strings containing the text content of the block +- questions: a list of 3 questions that a user may ask about the content in this block + +To generate the JSON objects: + +1. Carefully read through the HTML content and identify logical breaks or shifts in the content that would warrant splitting it into separate blocks. + +2. For each block: + a. Assign it an index based on its order in the content. + b. Analyze the content and generate a list of relevant semantic tags that describe what the block is about. + c. Extract the text content, clean it up if needed, and store it as a list of strings in the "content" field. + d. 
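The pydantic models above are plain data carriers, so building one by hand is a quick way to see which fields are required (url, html and success on CrawlResult; the three markdown strings on MarkdownGenerationResult). The values below are made up for the example:

from crawl4ai.models import CrawlResult, MarkdownGenerationResult

md = MarkdownGenerationResult(
    raw_markdown="# Example Domain\n\nThis domain is for use in examples.",
    markdown_with_citations="# Example Domain [1]",
    references_markdown="[1]: https://example.com",
)
result = CrawlResult(
    url="https://example.com",
    html="<html>...</html>",
    success=True,
    markdown_v2=md,
    status_code=200,
)
print(result.status_code, result.markdown_v2.raw_markdown.splitlines()[0])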
Come up with 3 questions that a user might ask about this specific block of content, based on the tags and content. The questions should be relevant and answerable by the content in the block. + +3. Ensure that the order of the JSON objects matches the order of the blocks as they appear in the original HTML content. + +4. Double-check that each JSON object includes all required keys (index, tags, content, questions) and that the values are in the expected format (integer, list of strings, etc.). + +5. Make sure the generated JSON is complete and parsable, with no errors or omissions. + +6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues. + +Please provide your output within tags, like this: + + +[{ + "index": 0, + "tags": ["introduction", "overview"], + "content": ["This is the first paragraph of the article, which provides an introduction and overview of the main topic."], + "questions": [ + "What is the main topic of this article?", + "What can I expect to learn from reading this article?", + "Is this article suitable for beginners or experts in the field?" + ] +}, +{ + "index": 1, + "tags": ["history", "background"], + "content": ["This is the second paragraph, which delves into the history and background of the topic.", + "It provides context and sets the stage for the rest of the article."], + "questions": [ + "What historical events led to the development of this topic?", + "How has the understanding of this topic evolved over time?", + "What are some key milestones in the history of this topic?" + ] +}] + + +Remember, the output should be a complete, parsable JSON wrapped in tags, with no omissions or errors. The JSON objects should semantically break down the content into relevant blocks, maintaining the original order.""" + +PROMPT_EXTRACT_BLOCKS = """Here is the URL of the webpage: +{URL} + +And here is the cleaned HTML content of that webpage: + +{HTML} + + +Your task is to break down this HTML content into semantically relevant blocks, and for each block, generate a JSON object with the following keys: + +- index: an integer representing the index of the block in the content +- content: a list of strings containing the text content of the block + +To generate the JSON objects: + +1. Carefully read through the HTML content and identify logical breaks or shifts in the content that would warrant splitting it into separate blocks. + +2. For each block: + a. Assign it an index based on its order in the content. + b. Analyze the content and generate ONE semantic tag that describe what the block is about. + c. Extract the text content, EXACTLY SAME AS THE GIVE DATA, clean it up if needed, and store it as a list of strings in the "content" field. + +3. Ensure that the order of the JSON objects matches the order of the blocks as they appear in the original HTML content. + +4. Double-check that each JSON object includes all required keys (index, tag, content) and that the values are in the expected format (integer, list of strings, etc.). + +5. Make sure the generated JSON is complete and parsable, with no errors or omissions. + +6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues. + +7. Never alter the extracted content, just copy and paste it as it is. 
+ +Please provide your output within tags, like this: + + +[{ + "index": 0, + "tags": ["introduction"], + "content": ["This is the first paragraph of the article, which provides an introduction and overview of the main topic."] +}, +{ + "index": 1, + "tags": ["background"], + "content": ["This is the second paragraph, which delves into the history and background of the topic.", + "It provides context and sets the stage for the rest of the article."] +}] + + +Remember, the output should be a complete, parsable JSON wrapped in tags, with no omissions or errors. The JSON objects should semantically break down the content into relevant blocks, maintaining the original order.""" + +PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION = """Here is the URL of the webpage: +{URL} + +And here is the cleaned HTML content of that webpage: + +{HTML} + + +Your task is to break down this HTML content into semantically relevant blocks, following the provided user's REQUEST, and for each block, generate a JSON object with the following keys: + +- index: an integer representing the index of the block in the content +- content: a list of strings containing the text content of the block + +This is the user's REQUEST, pay attention to it: + +{REQUEST} + + +To generate the JSON objects: + +1. Carefully read through the HTML content and identify logical breaks or shifts in the content that would warrant splitting it into separate blocks. + +2. For each block: + a. Assign it an index based on its order in the content. + b. Analyze the content and generate ONE semantic tag that describe what the block is about. + c. Extract the text content, EXACTLY SAME AS GIVE DATA, clean it up if needed, and store it as a list of strings in the "content" field. + +3. Ensure that the order of the JSON objects matches the order of the blocks as they appear in the original HTML content. + +4. Double-check that each JSON object includes all required keys (index, tag, content) and that the values are in the expected format (integer, list of strings, etc.). + +5. Make sure the generated JSON is complete and parsable, with no errors or omissions. + +6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues. + +7. Never alter the extracted content, just copy and paste it as it is. + +Please provide your output within tags, like this: + + +[{ + "index": 0, + "tags": ["introduction"], + "content": ["This is the first paragraph of the article, which provides an introduction and overview of the main topic."] +}, +{ + "index": 1, + "tags": ["background"], + "content": ["This is the second paragraph, which delves into the history and background of the topic.", + "It provides context and sets the stage for the rest of the article."] +}] + + +**Make sure to follow the user instruction to extract blocks aligin with the instruction.** + +Remember, the output should be a complete, parsable JSON wrapped in tags, with no omissions or errors. The JSON objects should semantically break down the content into relevant blocks, maintaining the original order.""" + +PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION = """Here is the content from the URL: +{URL} + + +{HTML} + + +The user has made the following request for what information to extract from the above content: + + +{REQUEST} + + + +{SCHEMA} + + +Please carefully read the URL content and the user's request. If the user provided a desired JSON schema in the above, extract the requested information from the URL content according to that schema. 
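These templates carry {URL}, {HTML} and {REQUEST} placeholders but also contain literal JSON braces in their examples, so a plain str.format() call would trip over them. Below is a sketch of one safe way to render them by replacing only the named placeholders; the helper name and sample values are mine, not necessarily how the library fills the templates internally:

from crawl4ai.prompts import PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION

def render_prompt(template: str, **values: str) -> str:
    # Swap in only the known placeholders; the literal {...} JSON examples
    # embedded in the template are left untouched.
    for key, value in values.items():
        template = template.replace("{" + key + "}", value)
    return template

prompt = render_prompt(
    PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION,
    URL="https://example.com",
    HTML="<h1>Example Domain</h1><p>This domain is for use in examples.</p>",
    REQUEST="Extract the page title and the first paragraph.",
)
print(prompt[:160])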
If no schema was provided, infer an appropriate JSON schema based on the user's request that will best capture the key information they are looking for. + +Extraction instructions: +Return the extracted information as a list of JSON objects, with each object in the list corresponding to a block of content from the URL, in the same order as it appears on the page. Wrap the entire JSON list in ... XML tags. + +Quality Reflection: +Before outputting your final answer, double check that the JSON you are returning is complete, containing all the information requested by the user, and is valid JSON that could be parsed by json.loads() with no errors or omissions. The outputted JSON objects should fully match the schema, either provided or inferred. + +Quality Score: +After reflecting, score the quality and completeness of the JSON data you are about to return on a scale of 1 to 5. Write the score inside tags. + +Avoid Common Mistakes: +- Do NOT add any comments using "//" or "#" in the JSON output. It causes parsing errors. +- Make sure the JSON is properly formatted with curly braces, square brackets, and commas in the right places. +- Do not miss closing tag at the end of the JSON output. +- Do not generate the Python coee show me how to do the task, this is your task to extract the information and return it in JSON format. + +Result +Output the final list of JSON objects, wrapped in ... XML tags. Make sure to close the tag properly.""" diff --git a/crawl4ai/ssl_certificate.py b/crawl4ai/ssl_certificate.py new file mode 100644 index 0000000000000000000000000000000000000000..97529e3e1d4aa358003b4cca3b7c1ee8929bd852 --- /dev/null +++ b/crawl4ai/ssl_certificate.py @@ -0,0 +1,181 @@ +"""SSL Certificate class for handling certificate operations.""" + +import ssl +import socket +import base64 +import json +from typing import Dict, Any, Optional +from urllib.parse import urlparse +import OpenSSL.crypto +from pathlib import Path + + +class SSLCertificate: + """ + A class representing an SSL certificate with methods to export in various formats. + + Attributes: + cert_info (Dict[str, Any]): The certificate information. + + Methods: + from_url(url: str, timeout: int = 10) -> Optional['SSLCertificate']: Create SSLCertificate instance from a URL. + from_file(file_path: str) -> Optional['SSLCertificate']: Create SSLCertificate instance from a file. + from_binary(binary_data: bytes) -> Optional['SSLCertificate']: Create SSLCertificate instance from binary data. + export_as_pem() -> str: Export the certificate as PEM format. + export_as_der() -> bytes: Export the certificate as DER format. + export_as_json() -> Dict[str, Any]: Export the certificate as JSON format. + export_as_text() -> str: Export the certificate as text format. + """ + def __init__(self, cert_info: Dict[str, Any]): + self._cert_info = self._decode_cert_data(cert_info) + + @staticmethod + def from_url(url: str, timeout: int = 10) -> Optional['SSLCertificate']: + """ + Create SSLCertificate instance from a URL. + + Args: + url (str): URL of the website. + timeout (int): Timeout for the connection (default: 10). + + Returns: + Optional[SSLCertificate]: SSLCertificate instance if successful, None otherwise. 
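On the consuming side, these prompts ask the model to wrap its JSON inside XML-style tags, so the response has to be unwrapped before json.loads(). The sketch below uses "blocks" purely as a stand-in tag name (the templates' actual tag names are not reproduced in this dump), and the sample response is fabricated:

import json
import re

def parse_tagged_json(response: str, tag: str = "blocks"):
    # 'blocks' is a placeholder; pass whatever tag the prompt asked the model
    # to wrap its output in. Falls back to the raw text if no tag is found.
    match = re.search(rf"<{tag}>(.*?)</{tag}>", response, re.DOTALL)
    payload = match.group(1).strip() if match else response.strip()
    return json.loads(payload)

sample = '<blocks>[{"index": 0, "tags": ["intro"], "content": ["Hello"]}]</blocks>'
print(parse_tagged_json(sample))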
+ """ + try: + hostname = urlparse(url).netloc + if ':' in hostname: + hostname = hostname.split(':')[0] + + context = ssl.create_default_context() + with socket.create_connection((hostname, 443), timeout=timeout) as sock: + with context.wrap_socket(sock, server_hostname=hostname) as ssock: + cert_binary = ssock.getpeercert(binary_form=True) + x509 = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_ASN1, cert_binary) + + cert_info = { + "subject": dict(x509.get_subject().get_components()), + "issuer": dict(x509.get_issuer().get_components()), + "version": x509.get_version(), + "serial_number": hex(x509.get_serial_number()), + "not_before": x509.get_notBefore(), + "not_after": x509.get_notAfter(), + "fingerprint": x509.digest("sha256").hex(), + "signature_algorithm": x509.get_signature_algorithm(), + "raw_cert": base64.b64encode(cert_binary) + } + + # Add extensions + extensions = [] + for i in range(x509.get_extension_count()): + ext = x509.get_extension(i) + extensions.append({ + "name": ext.get_short_name(), + "value": str(ext) + }) + cert_info["extensions"] = extensions + + return SSLCertificate(cert_info) + + except Exception as e: + return None + + @staticmethod + def _decode_cert_data(data: Any) -> Any: + """Helper method to decode bytes in certificate data.""" + if isinstance(data, bytes): + return data.decode('utf-8') + elif isinstance(data, dict): + return { + (k.decode('utf-8') if isinstance(k, bytes) else k): SSLCertificate._decode_cert_data(v) + for k, v in data.items() + } + elif isinstance(data, list): + return [SSLCertificate._decode_cert_data(item) for item in data] + return data + + def to_json(self, filepath: Optional[str] = None) -> Optional[str]: + """ + Export certificate as JSON. + + Args: + filepath (Optional[str]): Path to save the JSON file (default: None). + + Returns: + Optional[str]: JSON string if successful, None otherwise. + """ + json_str = json.dumps(self._cert_info, indent=2, ensure_ascii=False) + if filepath: + Path(filepath).write_text(json_str, encoding='utf-8') + return None + return json_str + + def to_pem(self, filepath: Optional[str] = None) -> Optional[str]: + """ + Export certificate as PEM. + + Args: + filepath (Optional[str]): Path to save the PEM file (default: None). + + Returns: + Optional[str]: PEM string if successful, None otherwise. + """ + try: + x509 = OpenSSL.crypto.load_certificate( + OpenSSL.crypto.FILETYPE_ASN1, + base64.b64decode(self._cert_info['raw_cert']) + ) + pem_data = OpenSSL.crypto.dump_certificate( + OpenSSL.crypto.FILETYPE_PEM, + x509 + ).decode('utf-8') + + if filepath: + Path(filepath).write_text(pem_data, encoding='utf-8') + return None + return pem_data + except Exception as e: + return None + + def to_der(self, filepath: Optional[str] = None) -> Optional[bytes]: + """ + Export certificate as DER. + + Args: + filepath (Optional[str]): Path to save the DER file (default: None). + + Returns: + Optional[bytes]: DER bytes if successful, None otherwise. 
+ """ + try: + der_data = base64.b64decode(self._cert_info['raw_cert']) + if filepath: + Path(filepath).write_bytes(der_data) + return None + return der_data + except Exception: + return None + + @property + def issuer(self) -> Dict[str, str]: + """Get certificate issuer information.""" + return self._cert_info.get('issuer', {}) + + @property + def subject(self) -> Dict[str, str]: + """Get certificate subject information.""" + return self._cert_info.get('subject', {}) + + @property + def valid_from(self) -> str: + """Get certificate validity start date.""" + return self._cert_info.get('not_before', '') + + @property + def valid_until(self) -> str: + """Get certificate validity end date.""" + return self._cert_info.get('not_after', '') + + @property + def fingerprint(self) -> str: + """Get certificate fingerprint.""" + return self._cert_info.get('fingerprint', '') diff --git a/crawl4ai/user_agent_generator.py b/crawl4ai/user_agent_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..6679bb1b25a2f020b403eb230a8a640194062f19 --- /dev/null +++ b/crawl4ai/user_agent_generator.py @@ -0,0 +1,305 @@ +import random +from typing import Optional, Literal, List, Dict, Tuple +import re + + +class UserAgentGenerator: + """ + Generate random user agents with specified constraints. + + Attributes: + desktop_platforms (dict): A dictionary of possible desktop platforms and their corresponding user agent strings. + mobile_platforms (dict): A dictionary of possible mobile platforms and their corresponding user agent strings. + browser_combinations (dict): A dictionary of possible browser combinations and their corresponding user agent strings. + rendering_engines (dict): A dictionary of possible rendering engines and their corresponding user agent strings. + chrome_versions (list): A list of possible Chrome browser versions. + firefox_versions (list): A list of possible Firefox browser versions. + edge_versions (list): A list of possible Edge browser versions. + safari_versions (list): A list of possible Safari browser versions. + ios_versions (list): A list of possible iOS browser versions. + android_versions (list): A list of possible Android browser versions. + + Methods: + generate_user_agent( + platform: Literal["desktop", "mobile"] = "desktop", + browser: str = "chrome", + rendering_engine: str = "chrome_webkit", + chrome_version: Optional[str] = None, + firefox_version: Optional[str] = None, + edge_version: Optional[str] = None, + safari_version: Optional[str] = None, + ios_version: Optional[str] = None, + android_version: Optional[str] = None + ): Generates a random user agent string based on the specified parameters. + """ + def __init__(self): + # Previous platform definitions remain the same... 
+ self.desktop_platforms = { + "windows": { + "10_64": "(Windows NT 10.0; Win64; x64)", + "10_32": "(Windows NT 10.0; WOW64)", + }, + "macos": { + "intel": "(Macintosh; Intel Mac OS X 10_15_7)", + "newer": "(Macintosh; Intel Mac OS X 10.15; rv:109.0)", + }, + "linux": { + "generic": "(X11; Linux x86_64)", + "ubuntu": "(X11; Ubuntu; Linux x86_64)", + "chrome_os": "(X11; CrOS x86_64 14541.0.0)", + } + } + + self.mobile_platforms = { + "android": { + "samsung": "(Linux; Android 13; SM-S901B)", + "pixel": "(Linux; Android 12; Pixel 6)", + "oneplus": "(Linux; Android 13; OnePlus 9 Pro)", + "xiaomi": "(Linux; Android 12; M2102J20SG)", + }, + "ios": { + "iphone": "(iPhone; CPU iPhone OS 16_5 like Mac OS X)", + "ipad": "(iPad; CPU OS 16_5 like Mac OS X)", + } + } + + # Browser Combinations + self.browser_combinations = { + 1: [ + ["chrome"], + ["firefox"], + ["safari"], + ["edge"] + ], + 2: [ + ["gecko", "firefox"], + ["chrome", "safari"], + ["webkit", "safari"] + ], + 3: [ + ["chrome", "safari", "edge"], + ["webkit", "chrome", "safari"] + ] + } + + # Rendering Engines with versions + self.rendering_engines = { + "chrome_webkit": "AppleWebKit/537.36", + "safari_webkit": "AppleWebKit/605.1.15", + "gecko": [ # Added Gecko versions + "Gecko/20100101", + "Gecko/20100101", # Firefox usually uses this constant version + "Gecko/2010010", + ] + } + + # Browser Versions + self.chrome_versions = [ + "Chrome/119.0.6045.199", + "Chrome/118.0.5993.117", + "Chrome/117.0.5938.149", + "Chrome/116.0.5845.187", + "Chrome/115.0.5790.171", + ] + + self.edge_versions = [ + "Edg/119.0.2151.97", + "Edg/118.0.2088.76", + "Edg/117.0.2045.47", + "Edg/116.0.1938.81", + "Edg/115.0.1901.203", + ] + + self.safari_versions = [ + "Safari/537.36", # For Chrome-based + "Safari/605.1.15", + "Safari/604.1", + "Safari/602.1", + "Safari/601.5.17", + ] + + # Added Firefox versions + self.firefox_versions = [ + "Firefox/119.0", + "Firefox/118.0.2", + "Firefox/117.0.1", + "Firefox/116.0", + "Firefox/115.0.3", + "Firefox/114.0.2", + "Firefox/113.0.1", + "Firefox/112.0", + "Firefox/111.0.1", + "Firefox/110.0", + ] + + def get_browser_stack(self, num_browsers: int = 1) -> List[str]: + """ + Get a valid combination of browser versions. + + How it works: + 1. Check if the number of browsers is supported. + 2. Randomly choose a combination of browsers. + 3. Iterate through the combination and add browser versions. + 4. Return the browser stack. + + Args: + num_browsers: Number of browser specifications (1-3) + + Returns: + List[str]: A list of browser versions. 
+ """ + if num_browsers not in self.browser_combinations: + raise ValueError(f"Unsupported number of browsers: {num_browsers}") + + combination = random.choice(self.browser_combinations[num_browsers]) + browser_stack = [] + + for browser in combination: + if browser == "chrome": + browser_stack.append(random.choice(self.chrome_versions)) + elif browser == "firefox": + browser_stack.append(random.choice(self.firefox_versions)) + elif browser == "safari": + browser_stack.append(random.choice(self.safari_versions)) + elif browser == "edge": + browser_stack.append(random.choice(self.edge_versions)) + elif browser == "gecko": + browser_stack.append(random.choice(self.rendering_engines["gecko"])) + elif browser == "webkit": + browser_stack.append(self.rendering_engines["chrome_webkit"]) + + return browser_stack + + def generate(self, + device_type: Optional[Literal['desktop', 'mobile']] = None, + os_type: Optional[str] = None, + device_brand: Optional[str] = None, + browser_type: Optional[Literal['chrome', 'edge', 'safari', 'firefox']] = None, + num_browsers: int = 3) -> str: + """ + Generate a random user agent with specified constraints. + + Args: + device_type: 'desktop' or 'mobile' + os_type: 'windows', 'macos', 'linux', 'android', 'ios' + device_brand: Specific device brand + browser_type: 'chrome', 'edge', 'safari', or 'firefox' + num_browsers: Number of browser specifications (1-3) + """ + # Get platform string + platform = self.get_random_platform(device_type, os_type, device_brand) + + # Start with Mozilla + components = ["Mozilla/5.0", platform] + + # Add browser stack + browser_stack = self.get_browser_stack(num_browsers) + + # Add appropriate legacy token based on browser stack + if "Firefox" in str(browser_stack): + components.append(random.choice(self.rendering_engines["gecko"])) + elif "Chrome" in str(browser_stack) or "Safari" in str(browser_stack): + components.append(self.rendering_engines["chrome_webkit"]) + components.append("(KHTML, like Gecko)") + + # Add browser versions + components.extend(browser_stack) + + return " ".join(components) + + def generate_with_client_hints(self, **kwargs) -> Tuple[str, str]: + """Generate both user agent and matching client hints""" + user_agent = self.generate(**kwargs) + client_hints = self.generate_client_hints(user_agent) + return user_agent, client_hints + + def get_random_platform(self, device_type, os_type, device_brand): + """Helper method to get random platform based on constraints""" + platforms = self.desktop_platforms if device_type == 'desktop' else \ + self.mobile_platforms if device_type == 'mobile' else \ + {**self.desktop_platforms, **self.mobile_platforms} + + if os_type: + for platform_group in [self.desktop_platforms, self.mobile_platforms]: + if os_type in platform_group: + platforms = {os_type: platform_group[os_type]} + break + + os_key = random.choice(list(platforms.keys())) + if device_brand and device_brand in platforms[os_key]: + return platforms[os_key][device_brand] + return random.choice(list(platforms[os_key].values())) + + def parse_user_agent(self, user_agent: str) -> Dict[str, str]: + """Parse a user agent string to extract browser and version information""" + browsers = { + 'chrome': r'Chrome/(\d+)', + 'edge': r'Edg/(\d+)', + 'safari': r'Version/(\d+)', + 'firefox': r'Firefox/(\d+)' + } + + result = {} + for browser, pattern in browsers.items(): + match = re.search(pattern, user_agent) + if match: + result[browser] = match.group(1) + + return result + + def generate_client_hints(self, user_agent: str) -> 
str: + """Generate Sec-CH-UA header value based on user agent string""" + browsers = self.parse_user_agent(user_agent) + + # Client hints components + hints = [] + + # Handle different browser combinations + if 'chrome' in browsers: + hints.append(f'"Chromium";v="{browsers["chrome"]}"') + hints.append('"Not_A Brand";v="8"') + + if 'edge' in browsers: + hints.append(f'"Microsoft Edge";v="{browsers["edge"]}"') + else: + hints.append(f'"Google Chrome";v="{browsers["chrome"]}"') + + elif 'firefox' in browsers: + # Firefox doesn't typically send Sec-CH-UA + return '""' + + elif 'safari' in browsers: + # Safari's format for client hints + hints.append(f'"Safari";v="{browsers["safari"]}"') + hints.append('"Not_A Brand";v="8"') + + return ', '.join(hints) + +# Example usage: +if __name__ == "__main__": + generator = UserAgentGenerator() + print(generator.generate()) + + print("\nSingle browser (Chrome):") + print(generator.generate(num_browsers=1, browser_type='chrome')) + + print("\nTwo browsers (Gecko/Firefox):") + print(generator.generate(num_browsers=2)) + + print("\nThree browsers (Chrome/Safari/Edge):") + print(generator.generate(num_browsers=3)) + + print("\nFirefox on Linux:") + print(generator.generate( + device_type='desktop', + os_type='linux', + browser_type='firefox', + num_browsers=2 + )) + + print("\nChrome/Safari/Edge on Windows:") + print(generator.generate( + device_type='desktop', + os_type='windows', + num_browsers=3 + )) \ No newline at end of file diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6fd7429f19df0ac1700dbac7b760cdc125697e14 --- /dev/null +++ b/crawl4ai/utils.py @@ -0,0 +1,1660 @@ +import time +from urllib.parse import urlparse +from concurrent.futures import ThreadPoolExecutor, as_completed +from bs4 import BeautifulSoup, Comment, element, Tag, NavigableString +import json +import html +import re +import os +import platform +from .prompts import PROMPT_EXTRACT_BLOCKS +from .config import * +from pathlib import Path +from typing import Dict, Any +from urllib.parse import urljoin +import requests +from requests.exceptions import InvalidSchema +from typing import Optional, Tuple, Dict, Any +import xxhash +from colorama import Fore, Style, init +import textwrap +import cProfile +import pstats +from functools import wraps +import asyncio + + +class InvalidCSSSelectorError(Exception): + pass + +def create_box_message(message: str, type: str = "info", width: int = 120, add_newlines: bool = True, double_line: bool = False) -> str: + """ + Create a styled message box with colored borders and formatted text. + + How it works: + 1. Determines box style and colors based on the message type (e.g., info, warning). + 2. Wraps text to fit within the specified width. + 3. Constructs a box using characters (single or double lines) with appropriate formatting. + 4. Adds optional newlines before and after the box. + + Args: + message (str): The message to display inside the box. + type (str): Type of the message (e.g., "info", "warning", "error", "success"). Defaults to "info". + width (int): Width of the box. Defaults to 120. + add_newlines (bool): Whether to add newlines before and after the box. Defaults to True. + double_line (bool): Whether to use double lines for the box border. Defaults to False. + + Returns: + str: A formatted string containing the styled message box. 
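One way to put the UserAgentGenerator above to work is to pair the generated UA string with its matching Sec-CH-UA value on outgoing requests; the header layout and the echo endpoint below are just an example wiring, not a prescribed integration:

import requests
from crawl4ai.user_agent_generator import UserAgentGenerator

generator = UserAgentGenerator()
user_agent, client_hints = generator.generate_with_client_hints(
    device_type="desktop", os_type="windows", num_browsers=3,
)
headers = {
    "User-Agent": user_agent,
    "Sec-CH-UA": client_hints,   # matches the browsers named in the UA string
}
resp = requests.get("https://httpbin.org/headers", headers=headers, timeout=10)
print(resp.json()["headers"].get("User-Agent"))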
+ """ + + init() + + # Define border and text colors for different types + styles = { + "warning": (Fore.YELLOW, Fore.LIGHTYELLOW_EX, "⚠"), + "info": (Fore.BLUE, Fore.LIGHTBLUE_EX, "ℹ"), + "success": (Fore.GREEN, Fore.LIGHTGREEN_EX, "✓"), + "error": (Fore.RED, Fore.LIGHTRED_EX, "×"), + } + + border_color, text_color, prefix = styles.get(type.lower(), styles["info"]) + + # Define box characters based on line style + box_chars = { + "single": ("─", "│", "┌", "┐", "└", "┘"), + "double": ("═", "║", "╔", "╗", "╚", "╝") + } + line_style = "double" if double_line else "single" + h_line, v_line, tl, tr, bl, br = box_chars[line_style] + + # Process lines with lighter text color + formatted_lines = [] + raw_lines = message.split('\n') + + if raw_lines: + first_line = f"{prefix} {raw_lines[0].strip()}" + wrapped_first = textwrap.fill(first_line, width=width-4) + formatted_lines.extend(wrapped_first.split('\n')) + + for line in raw_lines[1:]: + if line.strip(): + wrapped = textwrap.fill(f" {line.strip()}", width=width-4) + formatted_lines.extend(wrapped.split('\n')) + else: + formatted_lines.append("") + + # Create the box with colored borders and lighter text + horizontal_line = h_line * (width - 1) + box = [ + f"{border_color}{tl}{horizontal_line}{tr}", + *[f"{border_color}{v_line}{text_color} {line:<{width-2}}{border_color}{v_line}" for line in formatted_lines], + f"{border_color}{bl}{horizontal_line}{br}{Style.RESET_ALL}" + ] + + result = "\n".join(box) + if add_newlines: + result = f"\n{result}\n" + + return result + +def calculate_semaphore_count(): + """ + Calculate the optimal semaphore count based on system resources. + + How it works: + 1. Determines the number of CPU cores and total system memory. + 2. Sets a base count as half of the available CPU cores. + 3. Limits the count based on memory, assuming 2GB per semaphore instance. + 4. Returns the minimum value between CPU and memory-based limits. + + Returns: + int: The calculated semaphore count. + """ + + cpu_count = os.cpu_count() + memory_gb = get_system_memory() / (1024 ** 3) # Convert to GB + base_count = max(1, cpu_count // 2) + memory_based_cap = int(memory_gb / 2) # Assume 2GB per instance + return min(base_count, memory_based_cap) + +def get_system_memory(): + """ + Get the total system memory in bytes. + + How it works: + 1. Detects the operating system. + 2. Reads memory information from system-specific commands or files. + 3. Converts the memory to bytes for uniformity. + + Returns: + int: The total system memory in bytes. + + Raises: + OSError: If the operating system is unsupported. 
+ """ + + system = platform.system() + if system == "Linux": + with open('/proc/meminfo', 'r') as mem: + for line in mem: + if line.startswith('MemTotal:'): + return int(line.split()[1]) * 1024 # Convert KB to bytes + elif system == "Darwin": # macOS + import subprocess + output = subprocess.check_output(['sysctl', '-n', 'hw.memsize']).decode('utf-8') + return int(output.strip()) + elif system == "Windows": + import ctypes + kernel32 = ctypes.windll.kernel32 + c_ulonglong = ctypes.c_ulonglong + class MEMORYSTATUSEX(ctypes.Structure): + _fields_ = [ + ('dwLength', ctypes.c_ulong), + ('dwMemoryLoad', ctypes.c_ulong), + ('ullTotalPhys', c_ulonglong), + ('ullAvailPhys', c_ulonglong), + ('ullTotalPageFile', c_ulonglong), + ('ullAvailPageFile', c_ulonglong), + ('ullTotalVirtual', c_ulonglong), + ('ullAvailVirtual', c_ulonglong), + ('ullAvailExtendedVirtual', c_ulonglong), + ] + memoryStatus = MEMORYSTATUSEX() + memoryStatus.dwLength = ctypes.sizeof(MEMORYSTATUSEX) + kernel32.GlobalMemoryStatusEx(ctypes.byref(memoryStatus)) + return memoryStatus.ullTotalPhys + else: + raise OSError("Unsupported operating system") + +def get_home_folder(): + """ + Get or create the home folder for Crawl4AI configuration and cache. + + How it works: + 1. Uses environment variables or defaults to the user's home directory. + 2. Creates `.crawl4ai` and its subdirectories (`cache`, `models`) if they don't exist. + 3. Returns the path to the home folder. + + Returns: + str: The path to the Crawl4AI home folder. + """ + + home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), ".crawl4ai") + os.makedirs(home_folder, exist_ok=True) + os.makedirs(f"{home_folder}/cache", exist_ok=True) + os.makedirs(f"{home_folder}/models", exist_ok=True) + return home_folder + +def beautify_html(escaped_html): + """ + Beautifies an escaped HTML string. + + Parameters: + escaped_html (str): A string containing escaped HTML. + + Returns: + str: A beautifully formatted HTML string. + """ + # Unescape the HTML string + unescaped_html = html.unescape(escaped_html) + + # Use BeautifulSoup to parse and prettify the HTML + soup = BeautifulSoup(unescaped_html, 'html.parser') + pretty_html = soup.prettify() + + return pretty_html + +def split_and_parse_json_objects(json_string): + """ + Splits a JSON string which is a list of objects and tries to parse each object. + + Parameters: + json_string (str): A string representation of a list of JSON objects, e.g., '[{...}, {...}, ...]'. + + Returns: + tuple: A tuple containing two lists: + - First list contains all successfully parsed JSON objects. + - Second list contains the string representations of all segments that couldn't be parsed. 
+ """ + # Trim the leading '[' and trailing ']' + if json_string.startswith('[') and json_string.endswith(']'): + json_string = json_string[1:-1].strip() + + # Split the string into segments that look like individual JSON objects + segments = [] + depth = 0 + start_index = 0 + + for i, char in enumerate(json_string): + if char == '{': + if depth == 0: + start_index = i + depth += 1 + elif char == '}': + depth -= 1 + if depth == 0: + segments.append(json_string[start_index:i+1]) + + # Try parsing each segment + parsed_objects = [] + unparsed_segments = [] + + for segment in segments: + try: + obj = json.loads(segment) + parsed_objects.append(obj) + except json.JSONDecodeError: + unparsed_segments.append(segment) + + return parsed_objects, unparsed_segments + +def sanitize_html(html): + """ + Sanitize an HTML string by escaping quotes. + + How it works: + 1. Replaces all unwanted and special characters with an empty string. + 2. Escapes double and single quotes for safe usage. + + Args: + html (str): The HTML string to sanitize. + + Returns: + str: The sanitized HTML string. + """ + + # Replace all unwanted and special characters with an empty string + sanitized_html = html + # sanitized_html = re.sub(r'[^\w\s.,;:!?=\[\]{}()<>\/\\\-"]', '', html) + + # Escape all double and single quotes + sanitized_html = sanitized_html.replace('"', '\\"').replace("'", "\\'") + + return sanitized_html + +def sanitize_input_encode(text: str) -> str: + """Sanitize input to handle potential encoding issues.""" + try: + try: + if not text: + return '' + # Attempt to encode and decode as UTF-8 to handle potential encoding issues + return text.encode('utf-8', errors='ignore').decode('utf-8') + except UnicodeEncodeError as e: + print(f"Warning: Encoding issue detected. Some characters may be lost. Error: {e}") + # Fall back to ASCII if UTF-8 fails + return text.encode('ascii', errors='ignore').decode('ascii') + except Exception as e: + raise ValueError(f"Error sanitizing input: {str(e)}") from e + +def escape_json_string(s): + """ + Escapes characters in a string to be JSON safe. + + Parameters: + s (str): The input string to be escaped. + + Returns: + str: The escaped string, safe for JSON encoding. + """ + # Replace problematic backslash first + s = s.replace('\\', '\\\\') + + # Replace the double quote + s = s.replace('"', '\\"') + + # Escape control characters + s = s.replace('\b', '\\b') + s = s.replace('\f', '\\f') + s = s.replace('\n', '\\n') + s = s.replace('\r', '\\r') + s = s.replace('\t', '\\t') + + # Additional problematic characters + # Unicode control characters + s = re.sub(r'[\x00-\x1f\x7f-\x9f]', lambda x: '\\u{:04x}'.format(ord(x.group())), s) + + return s + +def replace_inline_tags(soup, tags, only_text=False): + """ + Replace inline HTML tags with Markdown-style equivalents. + + How it works: + 1. Maps specific tags (e.g., , ) to Markdown syntax. + 2. Finds and replaces all occurrences of these tags in the provided BeautifulSoup object. + 3. Optionally replaces tags with their text content only. + + Args: + soup (BeautifulSoup): Parsed HTML content. + tags (List[str]): List of tags to replace. + only_text (bool): Whether to replace tags with plain text. Defaults to False. + + Returns: + BeautifulSoup: Updated BeautifulSoup object with replaced tags. 
+ """ + + tag_replacements = { + 'b': lambda tag: f"**{tag.text}**", + 'i': lambda tag: f"*{tag.text}*", + 'u': lambda tag: f"__{tag.text}__", + 'span': lambda tag: f"{tag.text}", + 'del': lambda tag: f"~~{tag.text}~~", + 'ins': lambda tag: f"++{tag.text}++", + 'sub': lambda tag: f"~{tag.text}~", + 'sup': lambda tag: f"^^{tag.text}^^", + 'strong': lambda tag: f"**{tag.text}**", + 'em': lambda tag: f"*{tag.text}*", + 'code': lambda tag: f"`{tag.text}`", + 'kbd': lambda tag: f"`{tag.text}`", + 'var': lambda tag: f"_{tag.text}_", + 's': lambda tag: f"~~{tag.text}~~", + 'q': lambda tag: f'"{tag.text}"', + 'abbr': lambda tag: f"{tag.text} ({tag.get('title', '')})", + 'cite': lambda tag: f"_{tag.text}_", + 'dfn': lambda tag: f"_{tag.text}_", + 'time': lambda tag: f"{tag.text}", + 'small': lambda tag: f"{tag.text}", + 'mark': lambda tag: f"=={tag.text}==" + } + + replacement_data = [(tag, tag_replacements.get(tag, lambda t: t.text)) for tag in tags] + + for tag_name, replacement_func in replacement_data: + for tag in soup.find_all(tag_name): + replacement_text = tag.text if only_text else replacement_func(tag) + tag.replace_with(replacement_text) + + return soup + + # for tag_name in tags: + # for tag in soup.find_all(tag_name): + # if not only_text: + # replacement_text = tag_replacements.get(tag_name, lambda t: t.text)(tag) + # tag.replace_with(replacement_text) + # else: + # tag.replace_with(tag.text) + + # return soup + +def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None, **kwargs): + """ + Extract structured content, media, and links from website HTML. + + How it works: + 1. Parses the HTML content using BeautifulSoup. + 2. Extracts internal/external links and media (images, videos, audios). + 3. Cleans the content by removing unwanted tags and attributes. + 4. Converts cleaned HTML to Markdown. + 5. Collects metadata and returns the extracted information. + + Args: + url (str): The website URL. + html (str): The HTML content of the website. + word_count_threshold (int): Minimum word count for content inclusion. Defaults to MIN_WORD_THRESHOLD. + css_selector (Optional[str]): CSS selector to extract specific content. Defaults to None. + + Returns: + Dict[str, Any]: Extracted content including Markdown, cleaned HTML, media, links, and metadata. 
+ """ + + try: + if not html: + return None + # Parse HTML content with BeautifulSoup + soup = BeautifulSoup(html, 'html.parser') + + # Get the content within the tag + body = soup.body + + # If css_selector is provided, extract content based on the selector + if css_selector: + selected_elements = body.select(css_selector) + if not selected_elements: + raise InvalidCSSSelectorError(f"Invalid CSS selector , No elements found for CSS selector: {css_selector}") + div_tag = soup.new_tag('div') + for el in selected_elements: + div_tag.append(el) + body = div_tag + + links = { + 'internal': [], + 'external': [] + } + + # Extract all internal and external links + for a in body.find_all('a', href=True): + href = a['href'] + url_base = url.split('/')[2] + if href.startswith('http') and url_base not in href: + links['external'].append({ + 'href': href, + 'text': a.get_text() + }) + else: + links['internal'].append( + { + 'href': href, + 'text': a.get_text() + } + ) + + # Remove script, style, and other tags that don't carry useful content from body + for tag in body.find_all(['script', 'style', 'link', 'meta', 'noscript']): + tag.decompose() + + # Remove all attributes from remaining tags in body, except for img tags + for tag in body.find_all(): + if tag.name != 'img': + tag.attrs = {} + + # Extract all img tgas int0 [{src: '', alt: ''}] + media = { + 'images': [], + 'videos': [], + 'audios': [] + } + for img in body.find_all('img'): + media['images'].append({ + 'src': img.get('src'), + 'alt': img.get('alt'), + "type": "image" + }) + + # Extract all video tags into [{src: '', alt: ''}] + for video in body.find_all('video'): + media['videos'].append({ + 'src': video.get('src'), + 'alt': video.get('alt'), + "type": "video" + }) + + # Extract all audio tags into [{src: '', alt: ''}] + for audio in body.find_all('audio'): + media['audios'].append({ + 'src': audio.get('src'), + 'alt': audio.get('alt'), + "type": "audio" + }) + + # Replace images with their alt text or remove them if no alt text is available + for img in body.find_all('img'): + alt_text = img.get('alt') + if alt_text: + img.replace_with(soup.new_string(alt_text)) + else: + img.decompose() + + + # Create a function that replace content of all"pre" tag with its inner text + def replace_pre_tags_with_text(node): + for child in node.find_all('pre'): + # set child inner html to its text + child.string = child.get_text() + return node + + # Replace all "pre" tags with their inner text + body = replace_pre_tags_with_text(body) + + # Replace inline tags with their text content + body = replace_inline_tags( + body, + ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark'], + only_text=kwargs.get('only_text', False) + ) + + # Recursively remove empty elements, their parent elements, and elements with word count below threshold + def remove_empty_and_low_word_count_elements(node, word_count_threshold): + for child in node.contents: + if isinstance(child, element.Tag): + remove_empty_and_low_word_count_elements(child, word_count_threshold) + word_count = len(child.get_text(strip=True).split()) + if (len(child.contents) == 0 and not child.get_text(strip=True)) or word_count < word_count_threshold: + child.decompose() + return node + + body = remove_empty_and_low_word_count_elements(body, word_count_threshold) + + def remove_small_text_tags(body: Tag, word_count_threshold: int = MIN_WORD_THRESHOLD): + # We'll use a list to collect all tags that don't meet the 
word count requirement + tags_to_remove = [] + + # Traverse all tags in the body + for tag in body.find_all(True): # True here means all tags + # Check if the tag contains text and if it's not just whitespace + if tag.string and tag.string.strip(): + # Split the text by spaces and count the words + word_count = len(tag.string.strip().split()) + # If the word count is less than the threshold, mark the tag for removal + if word_count < word_count_threshold: + tags_to_remove.append(tag) + + # Remove all marked tags from the tree + for tag in tags_to_remove: + tag.decompose() # or tag.extract() to remove and get the element + + return body + + + # Remove small text tags + body = remove_small_text_tags(body, word_count_threshold) + + def is_empty_or_whitespace(tag: Tag): + if isinstance(tag, NavigableString): + return not tag.strip() + # Check if the tag itself is empty or all its children are empty/whitespace + if not tag.contents: + return True + return all(is_empty_or_whitespace(child) for child in tag.contents) + + def remove_empty_tags(body: Tag): + # Continue processing until no more changes are made + changes = True + while changes: + changes = False + # Collect all tags that are empty or contain only whitespace + empty_tags = [tag for tag in body.find_all(True) if is_empty_or_whitespace(tag)] + for tag in empty_tags: + # If a tag is empty, decompose it + tag.decompose() + changes = True # Mark that a change was made + + return body + + + # Remove empty tags + body = remove_empty_tags(body) + + # Flatten nested elements with only one child of the same type + def flatten_nested_elements(node): + for child in node.contents: + if isinstance(child, element.Tag): + flatten_nested_elements(child) + if len(child.contents) == 1 and child.contents[0].name == child.name: + # print('Flattening:', child.name) + child_content = child.contents[0] + child.replace_with(child_content) + + return node + + body = flatten_nested_elements(body) + + + + # Remove comments + for comment in soup.find_all(string=lambda text: isinstance(text, Comment)): + comment.extract() + + # Remove consecutive empty newlines and replace multiple spaces with a single space + cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ') + + # Sanitize the cleaned HTML content + cleaned_html = sanitize_html(cleaned_html) + # sanitized_html = escape_json_string(cleaned_html) + + # Convert cleaned HTML to Markdown + h = html2text.HTML2Text() + h = CustomHTML2Text() + h.ignore_links = True + markdown = h.handle(cleaned_html) + markdown = markdown.replace(' ```', '```') + + try: + meta = extract_metadata(html, soup) + except Exception as e: + print('Error extracting metadata:', str(e)) + meta = {} + + + # Return the Markdown content + return{ + 'markdown': markdown, + 'cleaned_html': cleaned_html, + 'success': True, + 'media': media, + 'links': links, + 'metadata': meta + } + + except Exception as e: + print('Error processing HTML content:', str(e)) + raise InvalidCSSSelectorError(f"Invalid CSS selector: {css_selector}") from e + +def get_content_of_website_optimized(url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs) -> Dict[str, Any]: + if not html: + return None + + soup = BeautifulSoup(html, 'html.parser') + body = soup.body + + image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD) + + for tag in kwargs.get('excluded_tags', []) or []: + for el in body.select(tag): + el.decompose() + + if css_selector: 
+ selected_elements = body.select(css_selector) + if not selected_elements: + raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}") + body = soup.new_tag('div') + for el in selected_elements: + body.append(el) + + links = {'internal': [], 'external': []} + media = {'images': [], 'videos': [], 'audios': []} + + # Extract meaningful text for media files from closest parent + def find_closest_parent_with_useful_text(tag): + current_tag = tag + while current_tag: + current_tag = current_tag.parent + # Get the text content from the parent tag + if current_tag: + text_content = current_tag.get_text(separator=' ',strip=True) + # Check if the text content has at least word_count_threshold + if len(text_content.split()) >= image_description_min_word_threshold: + return text_content + return None + + def process_image(img, url, index, total_images): + #Check if an image has valid display and inside undesired html elements + def is_valid_image(img, parent, parent_classes): + style = img.get('style', '') + src = img.get('src', '') + classes_to_check = ['button', 'icon', 'logo'] + tags_to_check = ['button', 'input'] + return all([ + 'display:none' not in style, + src, + not any(s in var for var in [src, img.get('alt', ''), *parent_classes] for s in classes_to_check), + parent.name not in tags_to_check + ]) + + #Score an image for it's usefulness + def score_image_for_usefulness(img, base_url, index, images_count): + # Function to parse image height/width value and units + def parse_dimension(dimension): + if dimension: + match = re.match(r"(\d+)(\D*)", dimension) + if match: + number = int(match.group(1)) + unit = match.group(2) or 'px' # Default unit is 'px' if not specified + return number, unit + return None, None + + # Fetch image file metadata to extract size and extension + def fetch_image_file_size(img, base_url): + #If src is relative path construct full URL, if not it may be CDN URL + img_url = urljoin(base_url,img.get('src')) + try: + response = requests.head(img_url) + if response.status_code == 200: + return response.headers.get('Content-Length',None) + else: + print(f"Failed to retrieve file size for {img_url}") + return None + except InvalidSchema as e: + return None + finally: + return + + image_height = img.get('height') + height_value, height_unit = parse_dimension(image_height) + image_width = img.get('width') + width_value, width_unit = parse_dimension(image_width) + image_size = 0 #int(fetch_image_file_size(img,base_url) or 0) + image_format = os.path.splitext(img.get('src',''))[1].lower() + # Remove . 
from format + image_format = image_format.strip('.') + score = 0 + if height_value: + if height_unit == 'px' and height_value > 150: + score += 1 + if height_unit in ['%','vh','vmin','vmax'] and height_value >30: + score += 1 + if width_value: + if width_unit == 'px' and width_value > 150: + score += 1 + if width_unit in ['%','vh','vmin','vmax'] and width_value >30: + score += 1 + if image_size > 10000: + score += 1 + if img.get('alt') != '': + score+=1 + if any(image_format==format for format in ['jpg','png','webp']): + score+=1 + if index/images_count<0.5: + score+=1 + return score + + if not is_valid_image(img, img.parent, img.parent.get('class', [])): + return None + score = score_image_for_usefulness(img, url, index, total_images) + if score <= IMAGE_SCORE_THRESHOLD: + return None + return { + 'src': img.get('src', '').replace('\\"', '"').strip(), + 'alt': img.get('alt', ''), + 'desc': find_closest_parent_with_useful_text(img), + 'score': score, + 'type': 'image' + } + + def process_element(element: element.PageElement) -> bool: + try: + if isinstance(element, NavigableString): + if isinstance(element, Comment): + element.extract() + return False + + if element.name in ['script', 'style', 'link', 'meta', 'noscript']: + element.decompose() + return False + + keep_element = False + + if element.name == 'a' and element.get('href'): + href = element['href'] + url_base = url.split('/')[2] + link_data = {'href': href, 'text': element.get_text()} + if href.startswith('http') and url_base not in href: + links['external'].append(link_data) + else: + links['internal'].append(link_data) + keep_element = True + + elif element.name == 'img': + return True # Always keep image elements + + elif element.name in ['video', 'audio']: + media[f"{element.name}s"].append({ + 'src': element.get('src'), + 'alt': element.get('alt'), + 'type': element.name, + 'description': find_closest_parent_with_useful_text(element) + }) + source_tags = element.find_all('source') + for source_tag in source_tags: + media[f"{element.name}s"].append({ + 'src': source_tag.get('src'), + 'alt': element.get('alt'), + 'type': element.name, + 'description': find_closest_parent_with_useful_text(element) + }) + return True # Always keep video and audio elements + + if element.name != 'pre': + if element.name in ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark']: + if kwargs.get('only_text', False): + element.replace_with(element.get_text()) + else: + element.unwrap() + elif element.name != 'img': + element.attrs = {} + + # Process children + for child in list(element.children): + if isinstance(child, NavigableString) and not isinstance(child, Comment): + if len(child.strip()) > 0: + keep_element = True + else: + if process_element(child): + keep_element = True + + + # Check word count + if not keep_element: + word_count = len(element.get_text(strip=True).split()) + keep_element = word_count >= word_count_threshold + + if not keep_element: + element.decompose() + + return keep_element + except Exception as e: + print('Error processing element:', str(e)) + return False + + #process images by filtering and extracting contextual text from the page + imgs = body.find_all('img') + media['images'] = [ + result for result in + (process_image(img, url, i, len(imgs)) for i, img in enumerate(imgs)) + if result is not None + ] + + process_element(body) + + def flatten_nested_elements(node): + if isinstance(node, NavigableString): + return node + if 
len(node.contents) == 1 and isinstance(node.contents[0], element.Tag) and node.contents[0].name == node.name: + return flatten_nested_elements(node.contents[0]) + node.contents = [flatten_nested_elements(child) for child in node.contents] + return node + + body = flatten_nested_elements(body) + base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)') + for img in imgs: + try: + src = img.get('src', '') + if base64_pattern.match(src): + img['src'] = base64_pattern.sub('', src) + except: + pass + + cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ') + cleaned_html = sanitize_html(cleaned_html) + + h = CustomHTML2Text() + h.ignore_links = True + markdown = h.handle(cleaned_html) + markdown = markdown.replace(' ```', '```') + + try: + meta = extract_metadata(html, soup) + except Exception as e: + print('Error extracting metadata:', str(e)) + meta = {} + + return { + 'markdown': markdown, + 'cleaned_html': cleaned_html, + 'success': True, + 'media': media, + 'links': links, + 'metadata': meta + } + +def extract_metadata(html, soup=None): + """ + Extract optimized content, media, and links from website HTML. + + How it works: + 1. Similar to `get_content_of_website`, but optimized for performance. + 2. Filters and scores images for usefulness. + 3. Extracts contextual descriptions for media files. + 4. Handles excluded tags and CSS selectors. + 5. Cleans HTML and converts it to Markdown. + + Args: + url (str): The website URL. + html (str): The HTML content of the website. + word_count_threshold (int): Minimum word count for content inclusion. Defaults to MIN_WORD_THRESHOLD. + css_selector (Optional[str]): CSS selector to extract specific content. Defaults to None. + **kwargs: Additional options for customization. + + Returns: + Dict[str, Any]: Extracted content including Markdown, cleaned HTML, media, links, and metadata. + """ + + metadata = {} + + if not html and not soup: + return {} + + if not soup: + soup = BeautifulSoup(html, 'lxml') + + head = soup.head + if not head: + return metadata + + # Title + title_tag = head.find('title') + metadata['title'] = title_tag.string.strip() if title_tag and title_tag.string else None + + # Meta description + description_tag = head.find('meta', attrs={'name': 'description'}) + metadata['description'] = description_tag.get('content', '').strip() if description_tag else None + + # Meta keywords + keywords_tag = head.find('meta', attrs={'name': 'keywords'}) + metadata['keywords'] = keywords_tag.get('content', '').strip() if keywords_tag else None + + # Meta author + author_tag = head.find('meta', attrs={'name': 'author'}) + metadata['author'] = author_tag.get('content', '').strip() if author_tag else None + + # Open Graph metadata + og_tags = head.find_all('meta', attrs={'property': re.compile(r'^og:')}) + for tag in og_tags: + property_name = tag.get('property', '').strip() + content = tag.get('content', '').strip() + if property_name and content: + metadata[property_name] = content + + # Twitter Card metadata + twitter_tags = head.find_all('meta', attrs={'name': re.compile(r'^twitter:')}) + for tag in twitter_tags: + property_name = tag.get('name', '').strip() + content = tag.get('content', '').strip() + if property_name and content: + metadata[property_name] = content + + return metadata + +def extract_xml_tags(string): + """ + Extracts XML tags from a string. + + Args: + string (str): The input string containing XML tags. + + Returns: + List[str]: A list of XML tags extracted from the input string. 
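+
+    Example (illustrative; duplicate tags are collapsed via a set, so ordering is not guaranteed):
+        >>> extract_xml_tags("<blocks>data</blocks>")
+        ['blocks']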
+ """ + tags = re.findall(r'<(\w+)>', string) + return list(set(tags)) + +def extract_xml_data(tags, string): + """ + Extract data for specified XML tags from a string. + + How it works: + 1. Searches the string for each tag using regex. + 2. Extracts the content within the tags. + 3. Returns a dictionary of tag-content pairs. + + Args: + tags (List[str]): The list of XML tags to extract. + string (str): The input string containing XML data. + + Returns: + Dict[str, str]: A dictionary with tag names as keys and extracted content as values. + """ + + data = {} + + for tag in tags: + pattern = f"<{tag}>(.*?)" + match = re.search(pattern, string, re.DOTALL) + if match: + data[tag] = match.group(1).strip() + else: + data[tag] = "" + + return data + +def perform_completion_with_backoff( + provider, + prompt_with_variables, + api_token, + json_response = False, + base_url=None, + **kwargs + ): + """ + Perform an API completion request with exponential backoff. + + How it works: + 1. Sends a completion request to the API. + 2. Retries on rate-limit errors with exponential delays. + 3. Returns the API response or an error after all retries. + + Args: + provider (str): The name of the API provider. + prompt_with_variables (str): The input prompt for the completion request. + api_token (str): The API token for authentication. + json_response (bool): Whether to request a JSON response. Defaults to False. + base_url (Optional[str]): The base URL for the API. Defaults to None. + **kwargs: Additional arguments for the API request. + + Returns: + dict: The API response or an error message after all retries. + """ + + from litellm import completion + from litellm.exceptions import RateLimitError + max_attempts = 3 + base_delay = 2 # Base delay in seconds, you can adjust this based on your needs + + extra_args = { + "temperature": 0.01, + 'api_key': api_token, + 'base_url': base_url + } + if json_response: + extra_args["response_format"] = { "type": "json_object" } + + if kwargs.get("extra_args"): + extra_args.update(kwargs["extra_args"]) + + for attempt in range(max_attempts): + try: + + response =completion( + model=provider, + messages=[ + {"role": "user", "content": prompt_with_variables} + ], + **extra_args + ) + return response # Return the successful response + except RateLimitError as e: + print("Rate limit error:", str(e)) + + # Check if we have exhausted our max attempts + if attempt < max_attempts - 1: + # Calculate the delay and wait + delay = base_delay * (2 ** attempt) # Exponential backoff formula + print(f"Waiting for {delay} seconds before retrying...") + time.sleep(delay) + else: + # Return an error response after exhausting all retries + return [{ + "index": 0, + "tags": ["error"], + "content": ["Rate limit error. Please try again later."] + }] + +def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None, base_url = None): + """ + Extract content blocks from website HTML using an AI provider. + + How it works: + 1. Prepares a prompt by sanitizing and escaping HTML. + 2. Sends the prompt to an AI provider with optional retries. + 3. Parses the response to extract structured blocks or errors. + + Args: + url (str): The website URL. + html (str): The HTML content of the website. + provider (str): The AI provider for content extraction. Defaults to DEFAULT_PROVIDER. + api_token (Optional[str]): The API token for authentication. Defaults to None. + base_url (Optional[str]): The base URL for the API. Defaults to None. 
+ + Returns: + List[dict]: A list of extracted content blocks. + """ + + # api_token = os.getenv('GROQ_API_KEY', None) if not api_token else api_token + api_token = PROVIDER_MODELS.get(provider, None) if not api_token else api_token + + variable_values = { + "URL": url, + "HTML": escape_json_string(sanitize_html(html)), + } + + prompt_with_variables = PROMPT_EXTRACT_BLOCKS + for variable in variable_values: + prompt_with_variables = prompt_with_variables.replace( + "{" + variable + "}", variable_values[variable] + ) + + response = perform_completion_with_backoff(provider, prompt_with_variables, api_token, base_url=base_url) + + try: + blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks'] + blocks = json.loads(blocks) + ## Add error: False to the blocks + for block in blocks: + block['error'] = False + except Exception as e: + parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content) + blocks = parsed + # Append all unparsed segments as onr error block and content is list of unparsed segments + if unparsed: + blocks.append({ + "index": 0, + "error": True, + "tags": ["error"], + "content": unparsed + }) + return blocks + +def extract_blocks_batch(batch_data, provider = "groq/llama3-70b-8192", api_token = None): + """ + Extract content blocks from a batch of website HTMLs. + + How it works: + 1. Prepares prompts for each URL and HTML pair. + 2. Sends the prompts to the AI provider in a batch request. + 3. Parses the responses to extract structured blocks or errors. + + Args: + batch_data (List[Tuple[str, str]]): A list of (URL, HTML) pairs. + provider (str): The AI provider for content extraction. Defaults to "groq/llama3-70b-8192". + api_token (Optional[str]): The API token for authentication. Defaults to None. + + Returns: + List[dict]: A list of extracted content blocks from all batch items. + """ + + api_token = os.getenv('GROQ_API_KEY', None) if not api_token else api_token + from litellm import batch_completion + messages = [] + + for url, html in batch_data: + variable_values = { + "URL": url, + "HTML": html, + } + + prompt_with_variables = PROMPT_EXTRACT_BLOCKS + for variable in variable_values: + prompt_with_variables = prompt_with_variables.replace( + "{" + variable + "}", variable_values[variable] + ) + + messages.append([{"role": "user", "content": prompt_with_variables}]) + + + responses = batch_completion( + model = provider, + messages = messages, + temperature = 0.01 + ) + + all_blocks = [] + for response in responses: + try: + blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks'] + blocks = json.loads(blocks) + + except Exception as e: + blocks = [{ + "index": 0, + "tags": ["error"], + "content": ["Error extracting blocks from the HTML content. Choose another provider/model or try again."], + "questions": ["What went wrong during the block extraction process?"] + }] + all_blocks.append(blocks) + + return sum(all_blocks, []) + +def merge_chunks_based_on_token_threshold(chunks, token_threshold): + """ + Merges small chunks into larger ones based on the total token threshold. + + :param chunks: List of text chunks to be merged based on token count. + :param token_threshold: Max number of tokens for each merged chunk. + :return: List of merged text chunks. 
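+
+    Example (illustrative; tokens are estimated as word count * 1.3, so a threshold of 6 fits two 2-word chunks but not three):
+        >>> merge_chunks_based_on_token_threshold(["a b", "c d", "e f"], token_threshold=6)
+        ['a b\n\nc d', 'e f']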
+ """ + merged_sections = [] + current_chunk = [] + total_token_so_far = 0 + + for chunk in chunks: + chunk_token_count = len(chunk.split()) * 1.3 # Estimate token count with a factor + if total_token_so_far + chunk_token_count < token_threshold: + current_chunk.append(chunk) + total_token_so_far += chunk_token_count + else: + if current_chunk: + merged_sections.append('\n\n'.join(current_chunk)) + current_chunk = [chunk] + total_token_so_far = chunk_token_count + + # Add the last chunk if it exists + if current_chunk: + merged_sections.append('\n\n'.join(current_chunk)) + + return merged_sections + +def process_sections(url: str, sections: list, provider: str, api_token: str, base_url=None) -> list: + """ + Process sections of HTML content sequentially or in parallel. + + How it works: + 1. Sequentially processes sections with delays for "groq/" providers. + 2. Uses ThreadPoolExecutor for parallel processing with other providers. + 3. Extracts content blocks for each section. + + Args: + url (str): The website URL. + sections (List[str]): The list of HTML sections to process. + provider (str): The AI provider for content extraction. + api_token (str): The API token for authentication. + base_url (Optional[str]): The base URL for the API. Defaults to None. + + Returns: + List[dict]: The list of extracted content blocks from all sections. + """ + + extracted_content = [] + if provider.startswith("groq/"): + # Sequential processing with a delay + for section in sections: + extracted_content.extend(extract_blocks(url, section, provider, api_token, base_url=base_url)) + time.sleep(0.5) # 500 ms delay between each processing + else: + # Parallel processing using ThreadPoolExecutor + with ThreadPoolExecutor() as executor: + futures = [executor.submit(extract_blocks, url, section, provider, api_token, base_url=base_url) for section in sections] + for future in as_completed(futures): + extracted_content.extend(future.result()) + + return extracted_content + +def wrap_text(draw, text, font, max_width): + """ + Wrap text to fit within a specified width for rendering. + + How it works: + 1. Splits the text into words. + 2. Constructs lines that fit within the maximum width using the provided font. + 3. Returns the wrapped text as a single string. + + Args: + draw (ImageDraw.Draw): The drawing context for measuring text size. + text (str): The text to wrap. + font (ImageFont.FreeTypeFont): The font to use for measuring text size. + max_width (int): The maximum width for each line. + + Returns: + str: The wrapped text. + """ + + # Wrap the text to fit within the specified width + lines = [] + words = text.split() + while words: + line = '' + while words and draw.textbbox((0, 0), line + words[0], font=font)[2] <= max_width: + line += (words.pop(0) + ' ') + lines.append(line) + return '\n'.join(lines) + +def format_html(html_string): + """ + Prettify an HTML string using BeautifulSoup. + + How it works: + 1. Parses the HTML string with BeautifulSoup. + 2. Formats the HTML with proper indentation. + 3. Returns the prettified HTML string. + + Args: + html_string (str): The HTML string to format. + + Returns: + str: The prettified HTML string. + """ + + soup = BeautifulSoup(html_string, 'lxml.parser') + return soup.prettify() + +def fast_format_html(html_string): + """ + A fast HTML formatter that uses string operations instead of parsing. 
+ + Args: + html_string (str): The HTML string to format + + Returns: + str: The formatted HTML string + """ + # Initialize variables + indent = 0 + indent_str = " " # Two spaces for indentation + formatted = [] + in_content = False + + # Split by < and > to separate tags and content + parts = html_string.replace('>', '>\n').replace('<', '\n<').split('\n') + + for part in parts: + if not part.strip(): + continue + + # Handle closing tags + if part.startswith(''): + formatted.append(indent_str * indent + part) + + # Handle opening tags + elif part.startswith('<'): + formatted.append(indent_str * indent + part) + indent += 1 + + # Handle content between tags + else: + content = part.strip() + if content: + formatted.append(indent_str * indent + content) + + return '\n'.join(formatted) + +def normalize_url(href, base_url): + """Normalize URLs to ensure consistent format""" + from urllib.parse import urljoin, urlparse + + # Parse base URL to get components + parsed_base = urlparse(base_url) + if not parsed_base.scheme or not parsed_base.netloc: + raise ValueError(f"Invalid base URL format: {base_url}") + + # Use urljoin to handle all cases + normalized = urljoin(base_url, href.strip()) + return normalized + +def normalize_url_tmp(href, base_url): + """Normalize URLs to ensure consistent format""" + # Extract protocol and domain from base URL + try: + base_parts = base_url.split('/') + protocol = base_parts[0] + domain = base_parts[2] + except IndexError: + raise ValueError(f"Invalid base URL format: {base_url}") + + # Handle special protocols + special_protocols = {'mailto:', 'tel:', 'ftp:', 'file:', 'data:', 'javascript:'} + if any(href.lower().startswith(proto) for proto in special_protocols): + return href.strip() + + # Handle anchor links + if href.startswith('#'): + return f"{base_url}{href}" + + # Handle protocol-relative URLs + if href.startswith('//'): + return f"{protocol}{href}" + + # Handle root-relative URLs + if href.startswith('/'): + return f"{protocol}//{domain}{href}" + + # Handle relative URLs + if not href.startswith(('http://', 'https://')): + # Remove leading './' if present + href = href.lstrip('./') + return f"{protocol}//{domain}/{href}" + + return href.strip() + +def get_base_domain(url: str) -> str: + """ + Extract the base domain from a given URL, handling common edge cases. + + How it works: + 1. Parses the URL to extract the domain. + 2. Removes the port number and 'www' prefix. + 3. Handles special domains (e.g., 'co.uk') to extract the correct base. + + Args: + url (str): The URL to extract the base domain from. + + Returns: + str: The extracted base domain or an empty string if parsing fails. + """ + try: + # Get domain from URL + domain = urlparse(url).netloc.lower() + if not domain: + return "" + + # Remove port if present + domain = domain.split(':')[0] + + # Remove www + domain = re.sub(r'^www\.', '', domain) + + # Extract last two parts of domain (handles co.uk etc) + parts = domain.split('.') + if len(parts) > 2 and parts[-2] in { + 'co', 'com', 'org', 'gov', 'edu', 'net', + 'mil', 'int', 'ac', 'ad', 'ae', 'af', 'ag' + }: + return '.'.join(parts[-3:]) + + return '.'.join(parts[-2:]) + except Exception: + return "" + +def is_external_url(url: str, base_domain: str) -> bool: + """ + Extract the base domain from a given URL, handling common edge cases. + + How it works: + 1. Parses the URL to extract the domain. + 2. Removes the port number and 'www' prefix. + 3. Handles special domains (e.g., 'co.uk') to extract the correct base. 
+ + Args: + url (str): The URL to extract the base domain from. + + Returns: + str: The extracted base domain or an empty string if parsing fails. + """ + special = {'mailto:', 'tel:', 'ftp:', 'file:', 'data:', 'javascript:'} + if any(url.lower().startswith(p) for p in special): + return True + + try: + parsed = urlparse(url) + if not parsed.netloc: # Relative URL + return False + + # Strip 'www.' from both domains for comparison + url_domain = parsed.netloc.lower().replace('www.', '') + base = base_domain.lower().replace('www.', '') + + # Check if URL domain ends with base domain + return not url_domain.endswith(base) + except Exception: + return False + +def clean_tokens(tokens: list[str]) -> list[str]: + """ + Clean a list of tokens by removing noise, stop words, and short tokens. + + How it works: + 1. Defines a set of noise words and stop words. + 2. Filters tokens based on length and exclusion criteria. + 3. Excludes tokens starting with certain symbols (e.g., "↑", "▲"). + + Args: + tokens (list[str]): The list of tokens to clean. + + Returns: + list[str]: The cleaned list of tokens. + """ + + # Set of tokens to remove + noise = {'ccp', 'up', '↑', '▲', '⬆️', 'a', 'an', 'at', 'by', 'in', 'of', 'on', 'to', 'the'} + + STOP_WORDS = { + 'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', + 'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', + 'to', 'was', 'were', 'will', 'with', + + # Pronouns + 'i', 'you', 'he', 'she', 'it', 'we', 'they', + 'me', 'him', 'her', 'us', 'them', + 'my', 'your', 'his', 'her', 'its', 'our', 'their', + 'mine', 'yours', 'hers', 'ours', 'theirs', + 'myself', 'yourself', 'himself', 'herself', 'itself', 'ourselves', 'themselves', + + # Common verbs + 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', + 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', + + # Prepositions + 'about', 'above', 'across', 'after', 'against', 'along', 'among', 'around', + 'at', 'before', 'behind', 'below', 'beneath', 'beside', 'between', 'beyond', + 'by', 'down', 'during', 'except', 'for', 'from', 'in', 'inside', 'into', + 'near', 'of', 'off', 'on', 'out', 'outside', 'over', 'past', 'through', + 'to', 'toward', 'under', 'underneath', 'until', 'up', 'upon', 'with', 'within', + + # Conjunctions + 'and', 'but', 'or', 'nor', 'for', 'yet', 'so', + 'although', 'because', 'since', 'unless', + + # Articles + 'a', 'an', 'the', + + # Other common words + 'this', 'that', 'these', 'those', + 'what', 'which', 'who', 'whom', 'whose', + 'when', 'where', 'why', 'how', + 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', + 'can', 'cannot', "can't", 'could', "couldn't", + 'may', 'might', 'must', "mustn't", + 'shall', 'should', "shouldn't", + 'will', "won't", 'would', "wouldn't", + 'not', "n't", 'no', 'nor', 'none' + } + + # Single comprehension, more efficient than multiple passes + return [token for token in tokens + if len(token) > 2 + and token not in noise + and token not in STOP_WORDS + and not token.startswith('↑') + and not token.startswith('▲') + and not token.startswith('⬆')] + +def profile_and_time(func): + """ + Decorator to profile a function's execution time and performance. + + How it works: + 1. Records the start time before executing the function. + 2. Profiles the function's execution using `cProfile`. + 3. Prints the elapsed time and profiling statistics. + + Args: + func (Callable): The function to decorate. + + Returns: + Callable: The decorated function with profiling and timing enabled. 
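+
+    Example (illustrative; the wrapper receives `self`, so it is meant for instance methods of a hypothetical class):
+        class Scraper:
+            @profile_and_time
+            def scrap(self, url):
+                ...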
+ """ + + @wraps(func) + def wrapper(self, *args, **kwargs): + # Start timer + start_time = time.perf_counter() + + # Setup profiler + profiler = cProfile.Profile() + profiler.enable() + + # Run function + result = func(self, *args, **kwargs) + + # Stop profiler + profiler.disable() + + # Calculate elapsed time + elapsed_time = time.perf_counter() - start_time + + # Print timing + print(f"[PROFILER] Scraping completed in {elapsed_time:.2f} seconds") + + # Print profiling stats + stats = pstats.Stats(profiler) + stats.sort_stats('cumulative') # Sort by cumulative time + stats.print_stats(20) # Print top 20 time-consuming functions + + return result + return wrapper + +def generate_content_hash(content: str) -> str: + """Generate a unique hash for content""" + return xxhash.xxh64(content.encode()).hexdigest() + # return hashlib.sha256(content.encode()).hexdigest() + +def ensure_content_dirs(base_path: str) -> Dict[str, str]: + """Create content directories if they don't exist""" + dirs = { + 'html': 'html_content', + 'cleaned': 'cleaned_html', + 'markdown': 'markdown_content', + 'extracted': 'extracted_content', + 'screenshots': 'screenshots', + 'screenshot': 'screenshots' + } + + content_paths = {} + for key, dirname in dirs.items(): + path = os.path.join(base_path, dirname) + os.makedirs(path, exist_ok=True) + content_paths[key] = path + + return content_paths + +def configure_windows_event_loop(): + """ + Configure the Windows event loop to use ProactorEventLoop. + This resolves the NotImplementedError that occurs on Windows when using asyncio subprocesses. + + This function should only be called on Windows systems and before any async operations. + On non-Windows systems, this function does nothing. + + Example: + ```python + from crawl4ai.async_configs import configure_windows_event_loop + + # Call this before any async operations if you're on Windows + configure_windows_event_loop() + ``` + """ + if platform.system() == 'Windows': + asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy()) + +def get_error_context(exc_info, context_lines: int = 5): + """ + Extract error context with more reliable line number tracking. 
+ + Args: + exc_info: The exception info from sys.exc_info() + context_lines: Number of lines to show before and after the error + + Returns: + dict: Error context information + """ + import traceback + import linecache + import os + + # Get the full traceback + tb = traceback.extract_tb(exc_info[2]) + + # Get the last frame (where the error occurred) + last_frame = tb[-1] + filename = last_frame.filename + line_no = last_frame.lineno + func_name = last_frame.name + + # Get the source code context using linecache + # This is more reliable than inspect.getsourcelines + context_start = max(1, line_no - context_lines) + context_end = line_no + context_lines + 1 + + # Build the context lines with line numbers + context_lines = [] + for i in range(context_start, context_end): + line = linecache.getline(filename, i) + if line: + # Remove any trailing whitespace/newlines and add the pointer for error line + line = line.rstrip() + pointer = '→' if i == line_no else ' ' + context_lines.append(f"{i:4d} {pointer} {line}") + + # Join the lines with newlines + code_context = '\n'.join(context_lines) + + # Get relative path for cleaner output + try: + rel_path = os.path.relpath(filename) + except ValueError: + # Fallback if relpath fails (can happen on Windows with different drives) + rel_path = filename + + return { + "filename": rel_path, + "line_no": line_no, + "function": func_name, + "code_context": code_context + } + + + \ No newline at end of file diff --git a/crawl4ai/utils.scraping.py b/crawl4ai/utils.scraping.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/crawl4ai/version_manager.py b/crawl4ai/version_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..8ae2de2e937fae526351b8dfd586567d21bc7be3 --- /dev/null +++ b/crawl4ai/version_manager.py @@ -0,0 +1,30 @@ +# version_manager.py +import os +from pathlib import Path +from packaging import version +from . 
import __version__ + +class VersionManager: + def __init__(self): + self.home_dir = Path.home() / ".crawl4ai" + self.version_file = self.home_dir / "version.txt" + + def get_installed_version(self): + """Get the version recorded in home directory""" + if not self.version_file.exists(): + return None + try: + return version.parse(self.version_file.read_text().strip()) + except: + return None + + def update_version(self): + """Update the version file to current library version""" + self.version_file.write_text(__version__.__version__) + + def needs_update(self): + """Check if database needs update based on version""" + installed = self.get_installed_version() + current = version.parse(__version__.__version__) + return installed is None or installed < current + diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py new file mode 100644 index 0000000000000000000000000000000000000000..a32a988d70767397102c054a95391fc00d0f8145 --- /dev/null +++ b/crawl4ai/web_crawler.py @@ -0,0 +1,253 @@ +import os, time +os.environ["TOKENIZERS_PARALLELISM"] = "false" +from pathlib import Path + +from .models import UrlModel, CrawlResult +from .database import init_db, get_cached_url, cache_url, DB_PATH, flush_db +from .utils import * +from .chunking_strategy import * +from .extraction_strategy import * +from .crawler_strategy import * +from typing import List +from concurrent.futures import ThreadPoolExecutor +from .content_scraping_strategy import WebScrapingStrategy +from .config import * +import warnings +import json +warnings.filterwarnings("ignore", message='Field "model_name" has conflict with protected namespace "model_".') + + +class WebCrawler: + def __init__(self, crawler_strategy: CrawlerStrategy = None, always_by_pass_cache: bool = False, verbose: bool = False): + self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose) + self.always_by_pass_cache = always_by_pass_cache + self.crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") + os.makedirs(self.crawl4ai_folder, exist_ok=True) + os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) + init_db() + self.ready = False + + def warmup(self): + print("[LOG] 🌤️ Warming up the WebCrawler") + self.run( + url='https://google.com/', + word_count_threshold=5, + extraction_strategy=NoExtractionStrategy(), + bypass_cache=False, + verbose=False + ) + self.ready = True + print("[LOG] 🌞 WebCrawler is ready to crawl") + + def fetch_page( + self, + url_model: UrlModel, + provider: str = DEFAULT_PROVIDER, + api_token: str = None, + extract_blocks_flag: bool = True, + word_count_threshold=MIN_WORD_THRESHOLD, + css_selector: str = None, + screenshot: bool = False, + use_cached_html: bool = False, + extraction_strategy: ExtractionStrategy = None, + chunking_strategy: ChunkingStrategy = RegexChunking(), + **kwargs, + ) -> CrawlResult: + return self.run( + url_model.url, + word_count_threshold, + extraction_strategy or NoExtractionStrategy(), + chunking_strategy, + bypass_cache=url_model.forced, + css_selector=css_selector, + screenshot=screenshot, + **kwargs, + ) + pass + + def fetch_pages( + self, + url_models: List[UrlModel], + provider: str = DEFAULT_PROVIDER, + api_token: str = None, + extract_blocks_flag: bool = True, + word_count_threshold=MIN_WORD_THRESHOLD, + use_cached_html: bool = False, + css_selector: str = None, + screenshot: bool = False, + extraction_strategy: ExtractionStrategy = None, + chunking_strategy: ChunkingStrategy = RegexChunking(), + **kwargs, + ) -> 
List[CrawlResult]: + extraction_strategy = extraction_strategy or NoExtractionStrategy() + def fetch_page_wrapper(url_model, *args, **kwargs): + return self.fetch_page(url_model, *args, **kwargs) + + with ThreadPoolExecutor() as executor: + results = list( + executor.map( + fetch_page_wrapper, + url_models, + [provider] * len(url_models), + [api_token] * len(url_models), + [extract_blocks_flag] * len(url_models), + [word_count_threshold] * len(url_models), + [css_selector] * len(url_models), + [screenshot] * len(url_models), + [use_cached_html] * len(url_models), + [extraction_strategy] * len(url_models), + [chunking_strategy] * len(url_models), + *[kwargs] * len(url_models), + ) + ) + + return results + + def run( + self, + url: str, + word_count_threshold=MIN_WORD_THRESHOLD, + extraction_strategy: ExtractionStrategy = None, + chunking_strategy: ChunkingStrategy = RegexChunking(), + bypass_cache: bool = False, + css_selector: str = None, + screenshot: bool = False, + user_agent: str = None, + verbose=True, + **kwargs, + ) -> CrawlResult: + try: + extraction_strategy = extraction_strategy or NoExtractionStrategy() + extraction_strategy.verbose = verbose + if not isinstance(extraction_strategy, ExtractionStrategy): + raise ValueError("Unsupported extraction strategy") + if not isinstance(chunking_strategy, ChunkingStrategy): + raise ValueError("Unsupported chunking strategy") + + word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD) + + cached = None + screenshot_data = None + extracted_content = None + if not bypass_cache and not self.always_by_pass_cache: + cached = get_cached_url(url) + + if kwargs.get("warmup", True) and not self.ready: + return None + + if cached: + html = sanitize_input_encode(cached[1]) + extracted_content = sanitize_input_encode(cached[4]) + if screenshot: + screenshot_data = cached[9] + if not screenshot_data: + cached = None + + if not cached or not html: + if user_agent: + self.crawler_strategy.update_user_agent(user_agent) + t1 = time.time() + html = sanitize_input_encode(self.crawler_strategy.crawl(url, **kwargs)) + t2 = time.time() + if verbose: + print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds") + if screenshot: + screenshot_data = self.crawler_strategy.take_screenshot() + + + crawl_result = self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs) + crawl_result.success = bool(html) + return crawl_result + except Exception as e: + if not hasattr(e, "msg"): + e.msg = str(e) + print(f"[ERROR] 🚫 Failed to crawl {url}, error: {e.msg}") + return CrawlResult(url=url, html="", success=False, error_message=e.msg) + + def process_html( + self, + url: str, + html: str, + extracted_content: str, + word_count_threshold: int, + extraction_strategy: ExtractionStrategy, + chunking_strategy: ChunkingStrategy, + css_selector: str, + screenshot: bool, + verbose: bool, + is_cached: bool, + **kwargs, + ) -> CrawlResult: + t = time.time() + # Extract content from HTML + try: + t1 = time.time() + scrapping_strategy = WebScrapingStrategy() + extra_params = {k: v for k, v in kwargs.items() if k not in ["only_text", "image_description_min_word_threshold"]} + result = scrapping_strategy.scrap( + url, + html, + word_count_threshold=word_count_threshold, + css_selector=css_selector, + only_text=kwargs.get("only_text", False), + image_description_min_word_threshold=kwargs.get( + 
"image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD + ), + **extra_params, + ) + + # result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False)) + if verbose: + print(f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds") + + if result is None: + raise ValueError(f"Failed to extract content from the website: {url}") + except InvalidCSSSelectorError as e: + raise ValueError(str(e)) + + cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) + markdown = sanitize_input_encode(result.get("markdown", "")) + media = result.get("media", []) + links = result.get("links", []) + metadata = result.get("metadata", {}) + + if extracted_content is None: + if verbose: + print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}") + + sections = chunking_strategy.chunk(markdown) + extracted_content = extraction_strategy.run(url, sections) + extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) + + if verbose: + print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t:.2f} seconds.") + + screenshot = None if not screenshot else screenshot + + if not is_cached: + cache_url( + url, + html, + cleaned_html, + markdown, + extracted_content, + True, + json.dumps(media), + json.dumps(links), + json.dumps(metadata), + screenshot=screenshot, + ) + + return CrawlResult( + url=url, + html=html, + cleaned_html=format_html(cleaned_html), + markdown=markdown, + media=media, + links=links, + metadata=metadata, + screenshot=screenshot, + extracted_content=extracted_content, + success=True, + error_message="", + ) \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000000000000000000000000000000000000..4b22fd9846cc0185ffe281e85ae4378538de282f --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,67 @@ +services: + # Local build services for different platforms + crawl4ai-amd64: + build: + context: . + dockerfile: Dockerfile + args: + PYTHON_VERSION: "3.10" + INSTALL_TYPE: ${INSTALL_TYPE:-basic} + ENABLE_GPU: false + platforms: + - linux/amd64 + profiles: ["local-amd64"] + extends: &base-config + file: docker-compose.yml + service: base-config + + crawl4ai-arm64: + build: + context: . 
+ dockerfile: Dockerfile + args: + PYTHON_VERSION: "3.10" + INSTALL_TYPE: ${INSTALL_TYPE:-basic} + ENABLE_GPU: false + platforms: + - linux/arm64 + profiles: ["local-arm64"] + extends: *base-config + + # Hub services for different platforms and versions + crawl4ai-hub-amd64: + image: unclecode/crawl4ai:${VERSION:-basic}-amd64 + profiles: ["hub-amd64"] + extends: *base-config + + crawl4ai-hub-arm64: + image: unclecode/crawl4ai:${VERSION:-basic}-arm64 + profiles: ["hub-arm64"] + extends: *base-config + + # Base configuration to be extended + base-config: + ports: + - "11235:11235" + - "8000:8000" + - "9222:9222" + - "8080:8080" + environment: + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} + - OPENAI_API_KEY=${OPENAI_API_KEY:-} + - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} + volumes: + - /dev/shm:/dev/shm + deploy: + resources: + limits: + memory: 4G + reservations: + memory: 1G + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:11235/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s \ No newline at end of file diff --git a/docs/assets/pitch-dark.png b/docs/assets/pitch-dark.png new file mode 100644 index 0000000000000000000000000000000000000000..9b9b37b5dc2d7645a267b733a0d2b4916f5af203 Binary files /dev/null and b/docs/assets/pitch-dark.png differ diff --git a/docs/assets/pitch-dark.svg b/docs/assets/pitch-dark.svg new file mode 100644 index 0000000000000000000000000000000000000000..0913b2dad5f90ff96e73cd690ff315d0adb675d2 --- /dev/null +++ b/docs/assets/pitch-dark.svg @@ -0,0 +1,64 @@ + + + + + + + + + Data Capitalization Opportunity + + Transform digital footprints into assets + Personal data as capital + Enterprise knowledge valuation + New form of wealth creation + + + + + Authentic Data Potential + + Vast reservoir of real insights + Enhanced AI development + Diverse human knowledge + Willing participation model + + + + + + + + 1. Open-Source Foundation + Data extraction engine & community development + + + + 2. Data Capitalization Platform + Tools to structure & value digital assets + + + + 3. Shared Data Marketplace + Economic platform for data exchange + + + + + + + + + + + + + + + + + + + Economic Vision: Shared Data Economy + + \ No newline at end of file diff --git a/docs/deprecated/docker-deployment.md b/docs/deprecated/docker-deployment.md new file mode 100644 index 0000000000000000000000000000000000000000..db8446e324b4f76a8b919e39662ba2f5cb7e52c5 --- /dev/null +++ b/docs/deprecated/docker-deployment.md @@ -0,0 +1,189 @@ +# 🐳 Using Docker (Legacy) + +Crawl4AI is available as Docker images for easy deployment. You can either pull directly from Docker Hub (recommended) or build from the repository. + +--- + +
+🐳 Option 1: Docker Hub (Recommended)
+
+Choose the appropriate image based on your platform and needs:
+
+### For AMD64 (Regular Linux/Windows):
+```bash
+# Basic version (recommended)
+docker pull unclecode/crawl4ai:basic-amd64
+docker run -p 11235:11235 unclecode/crawl4ai:basic-amd64
+
+# Full ML/LLM support
+docker pull unclecode/crawl4ai:all-amd64
+docker run -p 11235:11235 unclecode/crawl4ai:all-amd64
+
+# With GPU support
+docker pull unclecode/crawl4ai:gpu-amd64
+docker run -p 11235:11235 unclecode/crawl4ai:gpu-amd64
+```
+
+### For ARM64 (M1/M2 Macs, ARM servers):
+```bash
+# Basic version (recommended)
+docker pull unclecode/crawl4ai:basic-arm64
+docker run -p 11235:11235 unclecode/crawl4ai:basic-arm64
+
+# Full ML/LLM support
+docker pull unclecode/crawl4ai:all-arm64
+docker run -p 11235:11235 unclecode/crawl4ai:all-arm64
+
+# With GPU support
+docker pull unclecode/crawl4ai:gpu-arm64
+docker run -p 11235:11235 unclecode/crawl4ai:gpu-arm64
+```
+
+Need more memory? Add `--shm-size`:
+```bash
+docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic-amd64
+```
+
+Test the installation:
+```bash
+curl http://localhost:11235/health
+```
+
+### For Raspberry Pi (32-bit) (coming soon):
+```bash
+# Pull and run basic version (recommended for Raspberry Pi)
+docker pull unclecode/crawl4ai:basic-armv7
+docker run -p 11235:11235 unclecode/crawl4ai:basic-armv7
+
+# With increased shared memory if needed
+docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic-armv7
+```
+
+Note: Due to hardware constraints, only the basic version is recommended for Raspberry Pi.
+
    + +
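+You can run the same check from Python (a minimal sketch, assuming the `requests` package is installed locally and the container is reachable on `localhost:11235`):
+
+```python
+import requests
+
+# Same check as the curl command above, but from Python
+response = requests.get("http://localhost:11235/health", timeout=10)
+response.raise_for_status()
+print(response.text)
+```
+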
+🐳 Option 2: Build from Repository
+
+Build the image locally based on your platform:
+
+```bash
+# Clone the repository
+git clone https://github.com/unclecode/crawl4ai.git
+cd crawl4ai
+
+# For AMD64 (Regular Linux/Windows)
+docker build --platform linux/amd64 \
+    --tag crawl4ai:local \
+    --build-arg INSTALL_TYPE=basic \
+    .
+
+# For ARM64 (M1/M2 Macs, ARM servers)
+docker build --platform linux/arm64 \
+    --tag crawl4ai:local \
+    --build-arg INSTALL_TYPE=basic \
+    .
+```
+
+Build options:
+- INSTALL_TYPE=basic (default): Basic crawling features
+- INSTALL_TYPE=all: Full ML/LLM support
+- ENABLE_GPU=true: Add GPU support
+
+Example with all options:
+```bash
+docker build --platform linux/amd64 \
+    --tag crawl4ai:local \
+    --build-arg INSTALL_TYPE=all \
+    --build-arg ENABLE_GPU=true \
+    .
+```
+
+Run your local build:
+```bash
+# Regular run
+docker run -p 11235:11235 crawl4ai:local
+
+# With increased shared memory
+docker run --shm-size=2gb -p 11235:11235 crawl4ai:local
+```
+
+Test the installation:
+```bash
+curl http://localhost:11235/health
+```
+
    + +
+🐳 Option 3: Using Docker Compose
+
+Docker Compose provides a more structured way to run Crawl4AI, especially when dealing with environment variables and multiple configurations.
+
+```bash
+# Clone the repository
+git clone https://github.com/unclecode/crawl4ai.git
+cd crawl4ai
+```
+
+### For AMD64 (Regular Linux/Windows):
+```bash
+# Build and run locally
+docker-compose --profile local-amd64 up
+
+# Run from Docker Hub
+VERSION=basic docker-compose --profile hub-amd64 up   # Basic version
+VERSION=all docker-compose --profile hub-amd64 up     # Full ML/LLM support
+VERSION=gpu docker-compose --profile hub-amd64 up     # GPU support
+```
+
+### For ARM64 (M1/M2 Macs, ARM servers):
+```bash
+# Build and run locally
+docker-compose --profile local-arm64 up
+
+# Run from Docker Hub
+VERSION=basic docker-compose --profile hub-arm64 up   # Basic version
+VERSION=all docker-compose --profile hub-arm64 up     # Full ML/LLM support
+VERSION=gpu docker-compose --profile hub-arm64 up     # GPU support
+```
+
+Environment variables (optional):
+```bash
+# Create a .env file
+CRAWL4AI_API_TOKEN=your_token
+OPENAI_API_KEY=your_openai_key
+CLAUDE_API_KEY=your_claude_key
+```
+
+The compose file includes:
+- Memory management (4GB limit, 1GB reserved)
+- Shared memory volume for browser support
+- Health checks
+- Auto-restart policy
+- All necessary port mappings
+
+Test the installation:
+```bash
+curl http://localhost:11235/health
+```
+
    + +
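+If you set `CRAWL4AI_API_TOKEN`, API requests may need to carry it (a minimal sketch, assuming the server accepts a standard `Authorization: Bearer` header; adjust to match your deployment):
+
+```python
+import os
+import requests
+
+# Read the token from the same environment/.env value used by docker-compose
+token = os.getenv("CRAWL4AI_API_TOKEN", "your_token")
+headers = {"Authorization": f"Bearer {token}"}
+
+# Health check against the running compose service
+response = requests.get("http://localhost:11235/health", headers=headers, timeout=10)
+print(response.status_code, response.text)
+```
+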
+🚀 One-Click Deployment
+
+Deploy your own instance of Crawl4AI with one click:
+
+[![DigitalOcean Referral Badge](https://web-platforms.sfo2.cdn.digitaloceanspaces.com/WWW/Badge%203.svg)](https://www.digitalocean.com/?repo=https://github.com/unclecode/crawl4ai/tree/0.3.74&refcode=a0780f1bdb3d&utm_campaign=Referral_Invite&utm_medium=Referral_Program&utm_source=badge)
+
+> 💡 **Recommended specs**: 4GB RAM minimum. Select "professional-xs" or higher when deploying for stable operation.
+
+The deployment will:
+- Set up a Docker container with Crawl4AI
+- Configure Playwright and all dependencies
+- Start the FastAPI server on port `11235`
+- Set up health checks and auto-deployment
+
    diff --git a/docs/examples/amazon_product_extraction_direct_url.py b/docs/examples/amazon_product_extraction_direct_url.py new file mode 100644 index 0000000000000000000000000000000000000000..769c479e3f040e74c08dbdcc586fcaaddb92d4a4 --- /dev/null +++ b/docs/examples/amazon_product_extraction_direct_url.py @@ -0,0 +1,114 @@ +""" +This example demonstrates how to use JSON CSS extraction to scrape product information +from Amazon search results. It shows how to extract structured data like product titles, +prices, ratings, and other details using CSS selectors. +""" + +from crawl4ai import AsyncWebCrawler +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig +import json + +async def extract_amazon_products(): + # Initialize browser config + browser_config = BrowserConfig( + browser_type="chromium", + headless=True + ) + + # Initialize crawler config with JSON CSS extraction strategy + crawler_config = CrawlerRunConfig( + extraction_strategy=JsonCssExtractionStrategy( + schema={ + "name": "Amazon Product Search Results", + "baseSelector": "[data-component-type='s-search-result']", + "fields": [ + { + "name": "asin", + "selector": "", + "type": "attribute", + "attribute": "data-asin" + }, + { + "name": "title", + "selector": "h2 a span", + "type": "text" + }, + { + "name": "url", + "selector": "h2 a", + "type": "attribute", + "attribute": "href" + }, + { + "name": "image", + "selector": ".s-image", + "type": "attribute", + "attribute": "src" + }, + { + "name": "rating", + "selector": ".a-icon-star-small .a-icon-alt", + "type": "text" + }, + { + "name": "reviews_count", + "selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span", + "type": "text" + }, + { + "name": "price", + "selector": ".a-price .a-offscreen", + "type": "text" + }, + { + "name": "original_price", + "selector": ".a-price.a-text-price .a-offscreen", + "type": "text" + }, + { + "name": "sponsored", + "selector": ".puis-sponsored-label-text", + "type": "exists" + }, + { + "name": "delivery_info", + "selector": "[data-cy='delivery-recipe'] .a-color-base", + "type": "text", + "multiple": True + } + ] + } + ) + ) + + # Example search URL (you should replace with your actual Amazon URL) + url = "https://www.amazon.com/s?k=Samsung+Galaxy+Tab" + + # Use context manager for proper resource handling + async with AsyncWebCrawler(config=browser_config) as crawler: + # Extract the data + result = await crawler.arun(url=url, config=crawler_config) + + # Process and print the results + if result and result.extracted_content: + # Parse the JSON string into a list of products + products = json.loads(result.extracted_content) + + # Process each product in the list + for product in products: + print("\nProduct Details:") + print(f"ASIN: {product.get('asin')}") + print(f"Title: {product.get('title')}") + print(f"Price: {product.get('price')}") + print(f"Original Price: {product.get('original_price')}") + print(f"Rating: {product.get('rating')}") + print(f"Reviews: {product.get('reviews_count')}") + print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}") + if product.get('delivery_info'): + print(f"Delivery: {' '.join(product['delivery_info'])}") + print("-" * 80) + +if __name__ == "__main__": + import asyncio + asyncio.run(extract_amazon_products()) diff --git a/docs/examples/amazon_product_extraction_using_hooks.py b/docs/examples/amazon_product_extraction_using_hooks.py new file mode 100644 index 
0000000000000000000000000000000000000000..a17d60c5944101364897a4eaf620fef634d96e27 --- /dev/null +++ b/docs/examples/amazon_product_extraction_using_hooks.py @@ -0,0 +1,145 @@ +""" +This example demonstrates how to use JSON CSS extraction to scrape product information +from Amazon search results. It shows how to extract structured data like product titles, +prices, ratings, and other details using CSS selectors. +""" + +from crawl4ai import AsyncWebCrawler, CacheMode +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig +import json +from playwright.async_api import Page, BrowserContext + +async def extract_amazon_products(): + # Initialize browser config + browser_config = BrowserConfig( + # browser_type="chromium", + headless=True + ) + + # Initialize crawler config with JSON CSS extraction strategy nav-search-submit-button + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + + extraction_strategy=JsonCssExtractionStrategy( + schema={ + "name": "Amazon Product Search Results", + "baseSelector": "[data-component-type='s-search-result']", + "fields": [ + { + "name": "asin", + "selector": "", + "type": "attribute", + "attribute": "data-asin" + }, + { + "name": "title", + "selector": "h2 a span", + "type": "text" + }, + { + "name": "url", + "selector": "h2 a", + "type": "attribute", + "attribute": "href" + }, + { + "name": "image", + "selector": ".s-image", + "type": "attribute", + "attribute": "src" + }, + { + "name": "rating", + "selector": ".a-icon-star-small .a-icon-alt", + "type": "text" + }, + { + "name": "reviews_count", + "selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span", + "type": "text" + }, + { + "name": "price", + "selector": ".a-price .a-offscreen", + "type": "text" + }, + { + "name": "original_price", + "selector": ".a-price.a-text-price .a-offscreen", + "type": "text" + }, + { + "name": "sponsored", + "selector": ".puis-sponsored-label-text", + "type": "exists" + }, + { + "name": "delivery_info", + "selector": "[data-cy='delivery-recipe'] .a-color-base", + "type": "text", + "multiple": True + } + ] + } + ) + ) + + url = "https://www.amazon.com/" + + async def after_goto(page: Page, context: BrowserContext, url: str, response: dict, **kwargs): + """Hook called after navigating to each URL""" + print(f"[HOOK] after_goto - Successfully loaded: {url}") + + try: + # Wait for search box to be available + search_box = await page.wait_for_selector('#twotabsearchtextbox', timeout=1000) + + # Type the search query + await search_box.fill('Samsung Galaxy Tab') + + # Get the search button and prepare for navigation + search_button = await page.wait_for_selector('#nav-search-submit-button', timeout=1000) + + # Click with navigation waiting + await search_button.click() + + # Wait for search results to load + await page.wait_for_selector('[data-component-type="s-search-result"]', timeout=10000) + print("[HOOK] Search completed and results loaded!") + + except Exception as e: + print(f"[HOOK] Error during search operation: {str(e)}") + + return page + + # Use context manager for proper resource handling + async with AsyncWebCrawler(config=browser_config) as crawler: + + crawler.crawler_strategy.set_hook("after_goto", after_goto) + + # Extract the data + result = await crawler.arun(url=url, config=crawler_config) + + # Process and print the results + if result and result.extracted_content: + # Parse the JSON string into a list of products + products = 
json.loads(result.extracted_content) + + # Process each product in the list + for product in products: + print("\nProduct Details:") + print(f"ASIN: {product.get('asin')}") + print(f"Title: {product.get('title')}") + print(f"Price: {product.get('price')}") + print(f"Original Price: {product.get('original_price')}") + print(f"Rating: {product.get('rating')}") + print(f"Reviews: {product.get('reviews_count')}") + print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}") + if product.get('delivery_info'): + print(f"Delivery: {' '.join(product['delivery_info'])}") + print("-" * 80) + +if __name__ == "__main__": + import asyncio + asyncio.run(extract_amazon_products()) diff --git a/docs/examples/amazon_product_extraction_using_use_javascript.py b/docs/examples/amazon_product_extraction_using_use_javascript.py new file mode 100644 index 0000000000000000000000000000000000000000..15e5d6f59a726852d38357a3ab16d523d3a108de --- /dev/null +++ b/docs/examples/amazon_product_extraction_using_use_javascript.py @@ -0,0 +1,129 @@ +""" +This example demonstrates how to use JSON CSS extraction to scrape product information +from Amazon search results. It shows how to extract structured data like product titles, +prices, ratings, and other details using CSS selectors. +""" + +from crawl4ai import AsyncWebCrawler, CacheMode +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig +import json +from playwright.async_api import Page, BrowserContext + +async def extract_amazon_products(): + # Initialize browser config + browser_config = BrowserConfig( + # browser_type="chromium", + headless=True + ) + + js_code_to_search = """ + const task = async () => { + document.querySelector('#twotabsearchtextbox').value = 'Samsung Galaxy Tab'; + document.querySelector('#nav-search-submit-button').click(); + } + await task(); + """ + js_code_to_search_sync = """ + document.querySelector('#twotabsearchtextbox').value = 'Samsung Galaxy Tab'; + document.querySelector('#nav-search-submit-button').click(); + """ + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + js_code = js_code_to_search, + wait_for='css:[data-component-type="s-search-result"]', + extraction_strategy=JsonCssExtractionStrategy( + schema={ + "name": "Amazon Product Search Results", + "baseSelector": "[data-component-type='s-search-result']", + "fields": [ + { + "name": "asin", + "selector": "", + "type": "attribute", + "attribute": "data-asin" + }, + { + "name": "title", + "selector": "h2 a span", + "type": "text" + }, + { + "name": "url", + "selector": "h2 a", + "type": "attribute", + "attribute": "href" + }, + { + "name": "image", + "selector": ".s-image", + "type": "attribute", + "attribute": "src" + }, + { + "name": "rating", + "selector": ".a-icon-star-small .a-icon-alt", + "type": "text" + }, + { + "name": "reviews_count", + "selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span", + "type": "text" + }, + { + "name": "price", + "selector": ".a-price .a-offscreen", + "type": "text" + }, + { + "name": "original_price", + "selector": ".a-price.a-text-price .a-offscreen", + "type": "text" + }, + { + "name": "sponsored", + "selector": ".puis-sponsored-label-text", + "type": "exists" + }, + { + "name": "delivery_info", + "selector": "[data-cy='delivery-recipe'] .a-color-base", + "type": "text", + "multiple": True + } + ] + } + ) + ) + + # Example search URL (you should replace with your actual Amazon URL) + url = "https://www.amazon.com/" + + + # 
Use context manager for proper resource handling + async with AsyncWebCrawler(config=browser_config) as crawler: + # Extract the data + result = await crawler.arun(url=url, config=crawler_config) + + # Process and print the results + if result and result.extracted_content: + # Parse the JSON string into a list of products + products = json.loads(result.extracted_content) + + # Process each product in the list + for product in products: + print("\nProduct Details:") + print(f"ASIN: {product.get('asin')}") + print(f"Title: {product.get('title')}") + print(f"Price: {product.get('price')}") + print(f"Original Price: {product.get('original_price')}") + print(f"Rating: {product.get('rating')}") + print(f"Reviews: {product.get('reviews_count')}") + print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}") + if product.get('delivery_info'): + print(f"Delivery: {' '.join(product['delivery_info'])}") + print("-" * 80) + +if __name__ == "__main__": + import asyncio + asyncio.run(extract_amazon_products()) diff --git a/docs/examples/assets/audio.mp3 b/docs/examples/assets/audio.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..299149c6dec722f2a7274c2894bdde12d31107ef Binary files /dev/null and b/docs/examples/assets/audio.mp3 differ diff --git a/docs/examples/assets/basic.png b/docs/examples/assets/basic.png new file mode 100644 index 0000000000000000000000000000000000000000..ea68852bb48ff94699e35e93becb045776d0085f Binary files /dev/null and b/docs/examples/assets/basic.png differ diff --git a/docs/examples/assets/cosine_extraction.png b/docs/examples/assets/cosine_extraction.png new file mode 100644 index 0000000000000000000000000000000000000000..19252ad44da8dcefd409ce643e1f46f20e8d15a3 Binary files /dev/null and b/docs/examples/assets/cosine_extraction.png differ diff --git a/docs/examples/assets/css_js.png b/docs/examples/assets/css_js.png new file mode 100644 index 0000000000000000000000000000000000000000..9c0d2e60fef6a850badf251717c8bced2944ca36 Binary files /dev/null and b/docs/examples/assets/css_js.png differ diff --git a/docs/examples/assets/css_selector.png b/docs/examples/assets/css_selector.png new file mode 100644 index 0000000000000000000000000000000000000000..39357bb920182744dfce5509b5f289829eca2aba Binary files /dev/null and b/docs/examples/assets/css_selector.png differ diff --git a/docs/examples/assets/exec_script.png b/docs/examples/assets/exec_script.png new file mode 100644 index 0000000000000000000000000000000000000000..c2e478f70984ba3445629a793a7424a1feab8a64 Binary files /dev/null and b/docs/examples/assets/exec_script.png differ diff --git a/docs/examples/assets/llm_extraction.png b/docs/examples/assets/llm_extraction.png new file mode 100644 index 0000000000000000000000000000000000000000..95d2accb9f1d66a6e6f614cf6c15f103962e6476 Binary files /dev/null and b/docs/examples/assets/llm_extraction.png differ diff --git a/docs/examples/assets/semantic_extraction_cosine.png b/docs/examples/assets/semantic_extraction_cosine.png new file mode 100644 index 0000000000000000000000000000000000000000..eace4cf502eda675abcc3be9bd790a210a344538 Binary files /dev/null and b/docs/examples/assets/semantic_extraction_cosine.png differ diff --git a/docs/examples/assets/semantic_extraction_llm.png b/docs/examples/assets/semantic_extraction_llm.png new file mode 100644 index 0000000000000000000000000000000000000000..1dba8bc6f73cb361942ac24faed85c042f5f474b Binary files /dev/null and b/docs/examples/assets/semantic_extraction_llm.png differ diff --git 
a/docs/examples/async_webcrawler_multiple_urls_example.py b/docs/examples/async_webcrawler_multiple_urls_example.py new file mode 100644 index 0000000000000000000000000000000000000000..1d63ac80f364c42ed37c24b902fe46dfd4113538 --- /dev/null +++ b/docs/examples/async_webcrawler_multiple_urls_example.py @@ -0,0 +1,48 @@ +# File: async_webcrawler_multiple_urls_example.py +import os, sys +# append 2 parent directories to sys.path to import crawl4ai +parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +sys.path.append(parent_dir) + +import asyncio +from crawl4ai import AsyncWebCrawler + +async def main(): + # Initialize the AsyncWebCrawler + async with AsyncWebCrawler(verbose=True) as crawler: + # List of URLs to crawl + urls = [ + "https://example.com", + "https://python.org", + "https://github.com", + "https://stackoverflow.com", + "https://news.ycombinator.com" + ] + + # Set up crawling parameters + word_count_threshold = 100 + + # Run the crawling process for multiple URLs + results = await crawler.arun_many( + urls=urls, + word_count_threshold=word_count_threshold, + bypass_cache=True, + verbose=True + ) + + # Process the results + for result in results: + if result.success: + print(f"Successfully crawled: {result.url}") + print(f"Title: {result.metadata.get('title', 'N/A')}") + print(f"Word count: {len(result.markdown.split())}") + print(f"Number of links: {len(result.links.get('internal', [])) + len(result.links.get('external', []))}") + print(f"Number of images: {len(result.media.get('images', []))}") + print("---") + else: + print(f"Failed to crawl: {result.url}") + print(f"Error: {result.error_message}") + print("---") + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/docs/examples/browser_optimization_example.py b/docs/examples/browser_optimization_example.py new file mode 100644 index 0000000000000000000000000000000000000000..f57dc14782172e640fde80d2500d5a2aef9d2b26 --- /dev/null +++ b/docs/examples/browser_optimization_example.py @@ -0,0 +1,128 @@ +""" +This example demonstrates optimal browser usage patterns in Crawl4AI: +1. Sequential crawling with session reuse +2. Parallel crawling with browser instance reuse +3. 
Performance optimization settings +""" + +import asyncio +import os +from typing import List +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig +from crawl4ai.content_filter_strategy import PruningContentFilter +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator + + +async def crawl_sequential(urls: List[str]): + """ + Sequential crawling using session reuse - most efficient for moderate workloads + """ + print("\n=== Sequential Crawling with Session Reuse ===") + + # Configure browser with optimized settings + browser_config = BrowserConfig( + headless=True, + browser_args=[ + "--disable-gpu", # Disable GPU acceleration + "--disable-dev-shm-usage", # Disable /dev/shm usage + "--no-sandbox", # Required for Docker + ], + viewport={ + "width": 800, + "height": 600, + }, # Smaller viewport for better performance + ) + + # Configure crawl settings + crawl_config = CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator( + # content_filter=PruningContentFilter(), In case you need fit_markdown + ), + ) + + # Create single crawler instance + crawler = AsyncWebCrawler(config=browser_config) + await crawler.start() + + try: + session_id = "session1" # Use same session for all URLs + for url in urls: + result = await crawler.arun( + url=url, + config=crawl_config, + session_id=session_id, # Reuse same browser tab + ) + if result.success: + print(f"Successfully crawled {url}") + print(f"Content length: {len(result.markdown_v2.raw_markdown)}") + finally: + await crawler.close() + + +async def crawl_parallel(urls: List[str], max_concurrent: int = 3): + """ + Parallel crawling while reusing browser instance - best for large workloads + """ + print("\n=== Parallel Crawling with Browser Reuse ===") + + browser_config = BrowserConfig( + headless=True, + browser_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"], + viewport={"width": 800, "height": 600}, + ) + + crawl_config = CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator( + # content_filter=PruningContentFilter(), In case you need fit_markdown + ), + ) + + # Create single crawler instance for all parallel tasks + crawler = AsyncWebCrawler(config=browser_config) + await crawler.start() + + try: + # Create tasks in batches to control concurrency + for i in range(0, len(urls), max_concurrent): + batch = urls[i : i + max_concurrent] + tasks = [] + + for j, url in enumerate(batch): + session_id = ( + f"parallel_session_{j}" # Different session per concurrent task + ) + task = crawler.arun(url=url, config=crawl_config, session_id=session_id) + tasks.append(task) + + # Wait for batch to complete + results = await asyncio.gather(*tasks, return_exceptions=True) + + # Process results + for url, result in zip(batch, results): + if isinstance(result, Exception): + print(f"Error crawling {url}: {str(result)}") + elif result.success: + print(f"Successfully crawled {url}") + print(f"Content length: {len(result.markdown_v2.raw_markdown)}") + finally: + await crawler.close() + + +async def main(): + # Example URLs + urls = [ + "https://example.com/page1", + "https://example.com/page2", + "https://example.com/page3", + "https://example.com/page4", + ] + + # Demo sequential crawling + await crawl_sequential(urls) + + # Demo parallel crawling + await crawl_parallel(urls, max_concurrent=2) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/docs/examples/chainlit.md b/docs/examples/chainlit.md new file mode 100644 index 
0000000000000000000000000000000000000000..3b34b02f459a4e1bb038a0abc8cf5b59f8d80b76 --- /dev/null +++ b/docs/examples/chainlit.md @@ -0,0 +1,3 @@ +# Welcome to Crawl4AI! 🚀🤖 + +Hi there, Developer! 👋 Here is an example of a research pipeline, where you can share a URL in your conversation with any LLM, and the content of the crawled pages will then be used as context. \ No newline at end of file diff --git a/docs/examples/crawlai_vs_firecrawl.py b/docs/examples/crawlai_vs_firecrawl.py new file mode 100644 index 0000000000000000000000000000000000000000..b50b06dac8e1b7218ff63bd76fb28ee153d1c47e --- /dev/null +++ b/docs/examples/crawlai_vs_firecrawl.py @@ -0,0 +1,67 @@ +import os, time +# append the path to the root of the project +import sys +import asyncio +sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..')) +from firecrawl import FirecrawlApp +from crawl4ai import AsyncWebCrawler +__data__ = os.path.join(os.path.dirname(__file__), '..', '..') + '/.data' + +async def compare(): + app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY']) + + # Test Firecrawl with a simple crawl + start = time.time() + scrape_status = app.scrape_url( + 'https://www.nbcnews.com/business', + params={'formats': ['markdown', 'html']} + ) + end = time.time() + print(f"Time taken: {end - start} seconds") + print(len(scrape_status['markdown'])) + # save the markdown content with provider name + with open(f"{__data__}/firecrawl_simple.md", "w") as f: + f.write(scrape_status['markdown']) + # Count how many "cldnry.s-nbcnews.com" are in the markdown + print(scrape_status['markdown'].count("cldnry.s-nbcnews.com")) + + + + async with AsyncWebCrawler() as crawler: + start = time.time() + result = await crawler.arun( + url="https://www.nbcnews.com/business", + # js_code=["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"], + word_count_threshold=0, + bypass_cache=True, + verbose=False + ) + end = time.time() + print(f"Time taken: {end - start} seconds") + print(len(result.markdown)) + # save the markdown content with provider name + with open(f"{__data__}/crawl4ai_simple.md", "w") as f: + f.write(result.markdown) + # count how many "cldnry.s-nbcnews.com" are in the markdown + print(result.markdown.count("cldnry.s-nbcnews.com")) + + start = time.time() + result = await crawler.arun( + url="https://www.nbcnews.com/business", + js_code=["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"], + word_count_threshold=0, + bypass_cache=True, + verbose=False + ) + end = time.time() + print(f"Time taken: {end - start} seconds") + print(len(result.markdown)) + # save the markdown content with provider name + with open(f"{__data__}/crawl4ai_js.md", "w") as f: + f.write(result.markdown) + # count how many "cldnry.s-nbcnews.com" are in the markdown + print(result.markdown.count("cldnry.s-nbcnews.com")) + +if __name__ == "__main__": + asyncio.run(compare()) + \ No newline at end of file diff --git a/docs/examples/docker_example.py b/docs/examples/docker_example.py new file mode 100644 index 0000000000000000000000000000000000000000..48acc80995c44493d66f4ccfc4df721543113066 --- /dev/null +++ b/docs/examples/docker_example.py @@ -0,0 +1,357 @@ +import requests +import json +import time +import sys +import base64 +import os +from typing import Dict, Any + +class Crawl4AiTester: + def
__init__(self, base_url: str = "http://localhost:11235", api_token: str = None): + self.base_url = base_url + self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code" # Check environment variable as fallback + self.headers = {'Authorization': f'Bearer {self.api_token}'} if self.api_token else {} + + def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]: + # Submit crawl job + response = requests.post(f"{self.base_url}/crawl", json=request_data, headers=self.headers) + if response.status_code == 403: + raise Exception("API token is invalid or missing") + task_id = response.json()["task_id"] + print(f"Task ID: {task_id}") + + # Poll for result + start_time = time.time() + while True: + if time.time() - start_time > timeout: + raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds") + + result = requests.get(f"{self.base_url}/task/{task_id}", headers=self.headers) + status = result.json() + + if status["status"] == "failed": + print("Task failed:", status.get("error")) + raise Exception(f"Task failed: {status.get('error')}") + + if status["status"] == "completed": + return status + + time.sleep(2) + + def submit_sync(self, request_data: Dict[str, Any]) -> Dict[str, Any]: + response = requests.post(f"{self.base_url}/crawl_sync", json=request_data, headers=self.headers, timeout=60) + if response.status_code == 408: + raise TimeoutError("Task did not complete within server timeout") + response.raise_for_status() + return response.json() + + def crawl_direct(self, request_data: Dict[str, Any]) -> Dict[str, Any]: + """Directly crawl without using task queue""" + response = requests.post( + f"{self.base_url}/crawl_direct", + json=request_data, + headers=self.headers + ) + response.raise_for_status() + return response.json() + +def test_docker_deployment(version="basic"): + tester = Crawl4AiTester( + base_url="http://localhost:11235" , + # base_url="https://api.crawl4ai.com" # just for example + # api_token="test" # just for example + ) + print(f"Testing Crawl4AI Docker {version} version") + + # Health check with timeout and retry + max_retries = 5 + for i in range(max_retries): + try: + health = requests.get(f"{tester.base_url}/health", timeout=10) + print("Health check:", health.json()) + break + except requests.exceptions.RequestException as e: + if i == max_retries - 1: + print(f"Failed to connect after {max_retries} attempts") + sys.exit(1) + print(f"Waiting for service to start (attempt {i+1}/{max_retries})...") + time.sleep(5) + + # Test cases based on version + test_basic_crawl_direct(tester) + test_basic_crawl(tester) + test_basic_crawl(tester) + test_basic_crawl_sync(tester) + + if version in ["full", "transformer"]: + test_cosine_extraction(tester) + + test_js_execution(tester) + test_css_selector(tester) + test_structured_extraction(tester) + test_llm_extraction(tester) + test_llm_with_ollama(tester) + test_screenshot(tester) + + +def test_basic_crawl(tester: Crawl4AiTester): + print("\n=== Testing Basic Crawl ===") + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 10, + "session_id": "test" + } + + result = tester.submit_and_wait(request) + print(f"Basic crawl result length: {len(result['result']['markdown'])}") + assert result["result"]["success"] + assert len(result["result"]["markdown"]) > 0 + +def test_basic_crawl_sync(tester: Crawl4AiTester): + print("\n=== Testing Basic Crawl (Sync) ===") + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 10, + 
"session_id": "test" + } + + result = tester.submit_sync(request) + print(f"Basic crawl result length: {len(result['result']['markdown'])}") + assert result['status'] == 'completed' + assert result['result']['success'] + assert len(result['result']['markdown']) > 0 + +def test_basic_crawl_direct(tester: Crawl4AiTester): + print("\n=== Testing Basic Crawl (Direct) ===") + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 10, + # "session_id": "test" + "cache_mode": "bypass" # or "enabled", "disabled", "read_only", "write_only" + } + + result = tester.crawl_direct(request) + print(f"Basic crawl result length: {len(result['result']['markdown'])}") + assert result['result']['success'] + assert len(result['result']['markdown']) > 0 + +def test_js_execution(tester: Crawl4AiTester): + print("\n=== Testing JS Execution ===") + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 8, + "js_code": [ + "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();" + ], + "wait_for": "article.tease-card:nth-child(10)", + "crawler_params": { + "headless": True + } + } + + result = tester.submit_and_wait(request) + print(f"JS execution result length: {len(result['result']['markdown'])}") + assert result["result"]["success"] + +def test_css_selector(tester: Crawl4AiTester): + print("\n=== Testing CSS Selector ===") + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 7, + "css_selector": ".wide-tease-item__description", + "crawler_params": { + "headless": True + }, + "extra": {"word_count_threshold": 10} + + } + + result = tester.submit_and_wait(request) + print(f"CSS selector result length: {len(result['result']['markdown'])}") + assert result["result"]["success"] + +def test_structured_extraction(tester: Crawl4AiTester): + print("\n=== Testing Structured Extraction ===") + schema = { + "name": "Coinbase Crypto Prices", + "baseSelector": ".cds-tableRow-t45thuk", + "fields": [ + { + "name": "crypto", + "selector": "td:nth-child(1) h2", + "type": "text", + }, + { + "name": "symbol", + "selector": "td:nth-child(1) p", + "type": "text", + }, + { + "name": "price", + "selector": "td:nth-child(2)", + "type": "text", + } + ], + } + + request = { + "urls": "https://www.coinbase.com/explore", + "priority": 9, + "extraction_config": { + "type": "json_css", + "params": { + "schema": schema + } + } + } + + result = tester.submit_and_wait(request) + extracted = json.loads(result["result"]["extracted_content"]) + print(f"Extracted {len(extracted)} items") + print("Sample item:", json.dumps(extracted[0], indent=2)) + assert result["result"]["success"] + assert len(extracted) > 0 + +def test_llm_extraction(tester: Crawl4AiTester): + print("\n=== Testing LLM Extraction ===") + schema = { + "type": "object", + "properties": { + "model_name": { + "type": "string", + "description": "Name of the OpenAI model." + }, + "input_fee": { + "type": "string", + "description": "Fee for input token for the OpenAI model." + }, + "output_fee": { + "type": "string", + "description": "Fee for output token for the OpenAI model." 
+ } + }, + "required": ["model_name", "input_fee", "output_fee"] + } + + request = { + "urls": "https://openai.com/api/pricing", + "priority": 8, + "extraction_config": { + "type": "llm", + "params": { + "provider": "openai/gpt-4o-mini", + "api_token": os.getenv("OPENAI_API_KEY"), + "schema": schema, + "extraction_type": "schema", + "instruction": """From the crawled content, extract all mentioned model names along with their fees for input and output tokens.""" + } + }, + "crawler_params": {"word_count_threshold": 1} + } + + try: + result = tester.submit_and_wait(request) + extracted = json.loads(result["result"]["extracted_content"]) + print(f"Extracted {len(extracted)} model pricing entries") + print("Sample entry:", json.dumps(extracted[0], indent=2)) + assert result["result"]["success"] + except Exception as e: + print(f"LLM extraction test failed (might be due to missing API key): {str(e)}") + +def test_llm_with_ollama(tester: Crawl4AiTester): + print("\n=== Testing LLM with Ollama ===") + schema = { + "type": "object", + "properties": { + "article_title": { + "type": "string", + "description": "The main title of the news article" + }, + "summary": { + "type": "string", + "description": "A brief summary of the article content" + }, + "main_topics": { + "type": "array", + "items": {"type": "string"}, + "description": "Main topics or themes discussed in the article" + } + } + } + + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 8, + "extraction_config": { + "type": "llm", + "params": { + "provider": "ollama/llama2", + "schema": schema, + "extraction_type": "schema", + "instruction": "Extract the main article information including title, summary, and main topics." + } + }, + "extra": {"word_count_threshold": 1}, + "crawler_params": {"verbose": True} + } + + try: + result = tester.submit_and_wait(request) + extracted = json.loads(result["result"]["extracted_content"]) + print("Extracted content:", json.dumps(extracted, indent=2)) + assert result["result"]["success"] + except Exception as e: + print(f"Ollama extraction test failed: {str(e)}") + +def test_cosine_extraction(tester: Crawl4AiTester): + print("\n=== Testing Cosine Extraction ===") + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 8, + "extraction_config": { + "type": "cosine", + "params": { + "semantic_filter": "business finance economy", + "word_count_threshold": 10, + "max_dist": 0.2, + "top_k": 3 + } + } + } + + try: + result = tester.submit_and_wait(request) + extracted = json.loads(result["result"]["extracted_content"]) + print(f"Extracted {len(extracted)} text clusters") + print("First cluster tags:", extracted[0]["tags"]) + assert result["result"]["success"] + except Exception as e: + print(f"Cosine extraction test failed: {str(e)}") + +def test_screenshot(tester: Crawl4AiTester): + print("\n=== Testing Screenshot ===") + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 5, + "screenshot": True, + "crawler_params": { + "headless": True + } + } + + result = tester.submit_and_wait(request) + print("Screenshot captured:", bool(result["result"]["screenshot"])) + + if result["result"]["screenshot"]: + # Save screenshot + screenshot_data = base64.b64decode(result["result"]["screenshot"]) + with open("test_screenshot.jpg", "wb") as f: + f.write(screenshot_data) + print("Screenshot saved as test_screenshot.jpg") + + assert result["result"]["success"] + +if __name__ == "__main__": + version = sys.argv[1] if len(sys.argv) > 1 else "basic" + # version = "full" 
+ test_docker_deployment(version) \ No newline at end of file diff --git a/docs/examples/extraction_strategies_example.py b/docs/examples/extraction_strategies_example.py new file mode 100644 index 0000000000000000000000000000000000000000..348b891ee1290e8c5e10a8d1c53b49cd02ac7606 --- /dev/null +++ b/docs/examples/extraction_strategies_example.py @@ -0,0 +1,115 @@ +""" +Example demonstrating different extraction strategies with various input formats. +This example shows how to: +1. Use different input formats (markdown, HTML, fit_markdown) +2. Work with JSON-based extractors (CSS and XPath) +3. Use LLM-based extraction with different input formats +4. Configure browser and crawler settings properly +""" + +import asyncio +import os +from typing import Dict, Any + +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode +from crawl4ai.extraction_strategy import ( + LLMExtractionStrategy, + JsonCssExtractionStrategy, + JsonXPathExtractionStrategy +) +from crawl4ai.chunking_strategy import RegexChunking, IdentityChunking +from crawl4ai.content_filter_strategy import PruningContentFilter +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator + +async def run_extraction(crawler: AsyncWebCrawler, url: str, strategy, name: str): + """Helper function to run extraction with proper configuration""" + try: + # Configure the crawler run settings + config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + extraction_strategy=strategy, + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter() # For fit_markdown support + ) + ) + + # Run the crawler + result = await crawler.arun(url=url, config=config) + + if result.success: + print(f"\n=== {name} Results ===") + print(f"Extracted Content: {result.extracted_content}") + print(f"Raw Markdown Length: {len(result.markdown_v2.raw_markdown)}") + print(f"Citations Markdown Length: {len(result.markdown_v2.markdown_with_citations)}") + else: + print(f"Error in {name}: Crawl failed") + + except Exception as e: + print(f"Error in {name}: {str(e)}") + +async def main(): + # Example URL (replace with actual URL) + url = "https://example.com/product-page" + + # Configure browser settings + browser_config = BrowserConfig( + headless=True, + verbose=True + ) + + # Initialize extraction strategies + + # 1. LLM Extraction with different input formats + markdown_strategy = LLMExtractionStrategy( + provider="openai/gpt-4o-mini", + api_token=os.getenv("OPENAI_API_KEY"), + instruction="Extract product information including name, price, and description" + ) + + html_strategy = LLMExtractionStrategy( + input_format="html", + provider="openai/gpt-4o-mini", + api_token=os.getenv("OPENAI_API_KEY"), + instruction="Extract product information from HTML including structured data" + ) + + fit_markdown_strategy = LLMExtractionStrategy( + input_format="fit_markdown", + provider="openai/gpt-4o-mini", + api_token=os.getenv("OPENAI_API_KEY"), + instruction="Extract product information from cleaned markdown" + ) + + # 2. JSON CSS Extraction (automatically uses HTML input) + css_schema = { + "baseSelector": ".product", + "fields": [ + {"name": "title", "selector": "h1.product-title", "type": "text"}, + {"name": "price", "selector": ".price", "type": "text"}, + {"name": "description", "selector": ".description", "type": "text"} + ] + } + css_strategy = JsonCssExtractionStrategy(schema=css_schema) + + # 3. 
JSON XPath Extraction (automatically uses HTML input) + xpath_schema = { + "baseSelector": "//div[@class='product']", + "fields": [ + {"name": "title", "selector": ".//h1[@class='product-title']/text()", "type": "text"}, + {"name": "price", "selector": ".//span[@class='price']/text()", "type": "text"}, + {"name": "description", "selector": ".//div[@class='description']/text()", "type": "text"} + ] + } + xpath_strategy = JsonXPathExtractionStrategy(schema=xpath_schema) + + # Use context manager for proper resource handling + async with AsyncWebCrawler(config=browser_config) as crawler: + # Run all strategies + await run_extraction(crawler, url, markdown_strategy, "Markdown LLM") + await run_extraction(crawler, url, html_strategy, "HTML LLM") + await run_extraction(crawler, url, fit_markdown_strategy, "Fit Markdown LLM") + await run_extraction(crawler, url, css_strategy, "CSS Extraction") + await run_extraction(crawler, url, xpath_strategy, "XPath Extraction") + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/docs/examples/full_page_screenshot_and_pdf_export.md b/docs/examples/full_page_screenshot_and_pdf_export.md new file mode 100644 index 0000000000000000000000000000000000000000..8522675c3537b29fc1fbbbb5b8aa378e1974d78a --- /dev/null +++ b/docs/examples/full_page_screenshot_and_pdf_export.md @@ -0,0 +1,58 @@ +# Capturing Full-Page Screenshots and PDFs from Massive Webpages with Crawl4AI + +When dealing with very long web pages, traditional full-page screenshots can be slow or fail entirely. For large pages (like extensive Wikipedia articles), generating a single massive screenshot often leads to delays, memory issues, or style differences. + +**The New Approach:** +We’ve introduced a new feature that effortlessly handles even the biggest pages by first exporting them as a PDF, then converting that PDF into a high-quality image. This approach leverages the browser’s built-in PDF rendering, making it both stable and efficient for very long content. You also have the option to directly save the PDF for your own usage—no need for multiple passes or complex stitching logic. + +**Key Benefits:** +- **Reliability:** The PDF export never times out and works regardless of page length. +- **Versatility:** Get both the PDF and a screenshot in one crawl, without reloading or reprocessing. +- **Performance:** Skips manual scrolling and stitching images, reducing complexity and runtime. + +**Simple Example:** +```python +import os, sys +import asyncio +from crawl4ai import AsyncWebCrawler, CacheMode + +# Adjust paths as needed +parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(parent_dir) +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) + +async def main(): + async with AsyncWebCrawler() as crawler: + # Request both PDF and screenshot + result = await crawler.arun( + url='https://en.wikipedia.org/wiki/List_of_common_misconceptions', + cache_mode=CacheMode.BYPASS, + pdf=True, + screenshot=True + ) + + if result.success: + # Save screenshot + if result.screenshot: + from base64 import b64decode + with open(os.path.join(__location__, "screenshot.png"), "wb") as f: + f.write(b64decode(result.screenshot)) + + # Save PDF + if result.pdf: + pdf_bytes = b64decode(result.pdf) + with open(os.path.join(__location__, "page.pdf"), "wb") as f: + f.write(pdf_bytes) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**What Happens Under the Hood:** +- Crawl4AI navigates to the target page. 
+- If `pdf=True`, it exports the current page as a full PDF, capturing all of its content no matter the length. +- If `screenshot=True`, and a PDF is already available, it directly converts the first page of that PDF to an image for you—no repeated loading or scrolling. +- Finally, you get your PDF and/or screenshot ready to use. + +**Conclusion:** +With this feature, Crawl4AI becomes even more robust and versatile for large-scale content extraction. Whether you need a PDF snapshot or a quick screenshot, you now have a reliable solution for even the most extensive webpages. \ No newline at end of file diff --git a/docs/examples/hello_world.py b/docs/examples/hello_world.py new file mode 100644 index 0000000000000000000000000000000000000000..18534d0e09ad7c6054cfc91ccd4c694143691530 --- /dev/null +++ b/docs/examples/hello_world.py @@ -0,0 +1,20 @@ +import asyncio +from crawl4ai import * + +async def main(): + browser_config = BrowserConfig(headless=True, verbose=True) + async with AsyncWebCrawler(config=browser_config) as crawler: + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0) + ) + ) + result = await crawler.arun( + url="https://www.helloworld.org", + config=crawler_config + ) + print(result.markdown_v2.raw_markdown[:500]) + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/docs/examples/hooks_example.py b/docs/examples/hooks_example.py new file mode 100644 index 0000000000000000000000000000000000000000..09e0bc17d204b0c9f14eea6fc1f559eebf11720a --- /dev/null +++ b/docs/examples/hooks_example.py @@ -0,0 +1,107 @@ +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode +from playwright.async_api import Page, BrowserContext + +async def main(): + print("🔗 Hooks Example: Demonstrating different hook use cases") + + # Configure browser settings + browser_config = BrowserConfig( + headless=True + ) + + # Configure crawler settings + crawler_run_config = CrawlerRunConfig( + js_code="window.scrollTo(0, document.body.scrollHeight);", + wait_for="body", + cache_mode=CacheMode.BYPASS + ) + + # Create crawler instance + crawler = AsyncWebCrawler(config=browser_config) + + # Define and set hook functions + async def on_browser_created(browser, context: BrowserContext, **kwargs): + """Hook called after the browser is created""" + print("[HOOK] on_browser_created - Browser is ready!") + # Example: Set a cookie that will be used for all requests + return browser + + async def on_page_context_created(page: Page, context: BrowserContext, **kwargs): + """Hook called after a new page and context are created""" + print("[HOOK] on_page_context_created - New page created!") + # Example: Set default viewport size + await context.add_cookies([{ + 'name': 'session_id', + 'value': 'example_session', + 'domain': '.example.com', + 'path': '/' + }]) + await page.set_viewport_size({"width": 1920, "height": 1080}) + return page + + async def on_user_agent_updated(page: Page, context: BrowserContext, user_agent: str, **kwargs): + """Hook called when the user agent is updated""" + print(f"[HOOK] on_user_agent_updated - New user agent: {user_agent}") + return page + + async def on_execution_started(page: Page, context: BrowserContext, **kwargs): + """Hook called after custom JavaScript execution""" + print("[HOOK] on_execution_started - Custom JS executed!") + return page + + async def 
before_goto(page: Page, context: BrowserContext, url: str, **kwargs): + """Hook called before navigating to each URL""" + print(f"[HOOK] before_goto - About to visit: {url}") + # Example: Add custom headers for the request + await page.set_extra_http_headers({ + "Custom-Header": "my-value" + }) + return page + + async def after_goto(page: Page, context: BrowserContext, url: str, response: dict, **kwargs): + """Hook called after navigating to each URL""" + print(f"[HOOK] after_goto - Successfully loaded: {url}") + # Example: Wait for a specific element to be loaded + try: + await page.wait_for_selector('.content', timeout=1000) + print("Content element found!") + except: + print("Content element not found, continuing anyway") + return page + + async def before_retrieve_html(page: Page, context: BrowserContext, **kwargs): + """Hook called before retrieving the HTML content""" + print("[HOOK] before_retrieve_html - About to get HTML content") + # Example: Scroll to bottom to trigger lazy loading + await page.evaluate("window.scrollTo(0, document.body.scrollHeight);") + return page + + async def before_return_html(page: Page, context: BrowserContext, html:str, **kwargs): + """Hook called before returning the HTML content""" + print(f"[HOOK] before_return_html - Got HTML content (length: {len(html)})") + # Example: You could modify the HTML content here if needed + return page + + # Set all the hooks + crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created) + crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created) + crawler.crawler_strategy.set_hook("on_user_agent_updated", on_user_agent_updated) + crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started) + crawler.crawler_strategy.set_hook("before_goto", before_goto) + crawler.crawler_strategy.set_hook("after_goto", after_goto) + crawler.crawler_strategy.set_hook("before_retrieve_html", before_retrieve_html) + crawler.crawler_strategy.set_hook("before_return_html", before_return_html) + + await crawler.start() + + # Example usage: crawl a simple website + url = 'https://example.com' + result = await crawler.arun(url, config=crawler_run_config) + print(f"\nCrawled URL: {result.url}") + print(f"HTML length: {len(result.html)}") + + await crawler.close() + +if __name__ == "__main__": + import asyncio + asyncio.run(main()) \ No newline at end of file diff --git a/docs/examples/language_support_example.py b/docs/examples/language_support_example.py new file mode 100644 index 0000000000000000000000000000000000000000..b74a8402e81263edc83e2ecb04454bcd4774f52b --- /dev/null +++ b/docs/examples/language_support_example.py @@ -0,0 +1,45 @@ +import asyncio +from crawl4ai import AsyncWebCrawler, AsyncPlaywrightCrawlerStrategy + +async def main(): + # Example 1: Setting language when creating the crawler + crawler1 = AsyncWebCrawler( + crawler_strategy=AsyncPlaywrightCrawlerStrategy( + headers={"Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7"} + ) + ) + result1 = await crawler1.arun("https://www.example.com") + print("Example 1 result:", result1.extracted_content[:100]) # Print first 100 characters + + # Example 2: Setting language before crawling + crawler2 = AsyncWebCrawler() + crawler2.crawler_strategy.headers["Accept-Language"] = "es-ES,es;q=0.9,en-US;q=0.8,en;q=0.7" + result2 = await crawler2.arun("https://www.example.com") + print("Example 2 result:", result2.extracted_content[:100]) + + # Example 3: Setting language when calling arun method + crawler3 = AsyncWebCrawler() 
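+    # Unlike Example 2, which mutates the strategy-level headers used for every later request,
+    # the headers passed to arun() below are supplied per call (as this example intends).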
+ result3 = await crawler3.arun( + "https://www.example.com", + headers={"Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7"} + ) + print("Example 3 result:", result3.extracted_content[:100]) + + # Example 4: Crawling multiple pages with different languages + urls = [ + ("https://www.example.com", "fr-FR,fr;q=0.9"), + ("https://www.example.org", "es-ES,es;q=0.9"), + ("https://www.example.net", "de-DE,de;q=0.9"), + ] + + crawler4 = AsyncWebCrawler() + results = await asyncio.gather(*[ + crawler4.arun(url, headers={"Accept-Language": lang}) + for url, lang in urls + ]) + + for url, result in zip([u for u, _ in urls], results): + print(f"Result for {url}:", result.extracted_content[:100]) + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/docs/examples/llm_extraction_openai_pricing.py b/docs/examples/llm_extraction_openai_pricing.py new file mode 100644 index 0000000000000000000000000000000000000000..5ae3d4d1f68e96d85483468bee24bd084599bf26 --- /dev/null +++ b/docs/examples/llm_extraction_openai_pricing.py @@ -0,0 +1,40 @@ +from crawl4ai.extraction_strategy import * +from crawl4ai.crawler_strategy import * +import os, json, asyncio +from pydantic import BaseModel, Field + +url = r'https://openai.com/api/pricing/' + +class OpenAIModelFee(BaseModel): + model_name: str = Field(..., description="Name of the OpenAI model.") + input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") + output_fee: str = Field(..., description="Fee for output token for the OpenAI model.") + +from crawl4ai import AsyncWebCrawler + +async def main(): + # Use AsyncWebCrawler + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url=url, + word_count_threshold=1, + extraction_strategy= LLMExtractionStrategy( + # provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'), + provider= "groq/llama-3.1-70b-versatile", api_token = os.getenv('GROQ_API_KEY'), + schema=OpenAIModelFee.model_json_schema(), + extraction_type="schema", + instruction="From the crawled content, extract all mentioned model names along with their " \ + "fees for input and output tokens. Make sure not to miss anything in the entire content. 
" \ + 'One extracted model JSON format should look like this: ' \ + '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }' + ), + + ) + print("Success:", result.success) + model_fees = json.loads(result.extracted_content) + print(len(model_fees)) + + with open(".data/data.json", "w", encoding="utf-8") as f: + f.write(result.extracted_content) + +asyncio.run(main()) diff --git a/docs/examples/quickstart.ipynb b/docs/examples/quickstart.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..4751dec8b9c2c55df265d4f1440eec0a14c9ddb1 --- /dev/null +++ b/docs/examples/quickstart.ipynb @@ -0,0 +1,664 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0cba38e5", + "metadata": {}, + "source": [ + "# Crawl4AI 🕷️🤖\n", + "\"unclecode%2Fcrawl4ai\n", + "\n", + "[![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers)\n", + "![PyPI - Downloads](https://img.shields.io/pypi/dm/Crawl4AI)\n", + "[![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members)\n", + "[![GitHub Issues](https://img.shields.io/github/issues/unclecode/crawl4ai)](https://github.com/unclecode/crawl4ai/issues)\n", + "[![GitHub Pull Requests](https://img.shields.io/github/issues-pr/unclecode/crawl4ai)](https://github.com/unclecode/crawl4ai/pulls)\n", + "[![License](https://img.shields.io/github/license/unclecode/crawl4ai)](https://github.com/unclecode/crawl4ai/blob/main/LICENSE)\n", + "\n", + "Crawl4AI simplifies asynchronous web crawling and data extraction, making it accessible for large language models (LLMs) and AI applications. 🆓🌐\n", + "\n", + "- GitHub Repository: [https://github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)\n", + "- Twitter: [@unclecode](https://twitter.com/unclecode)\n", + "- Website: [https://crawl4ai.com](https://crawl4ai.com)\n", + "\n", + "## 🌟 Meet the Crawl4AI Assistant: Your Copilot for Crawling\n", + "Use the [Crawl4AI GPT Assistant](https://tinyurl.com/crawl4ai-gpt) as your AI-powered copilot! With this assistant, you can:\n", + "- 🧑‍💻 Generate code for complex crawling and extraction tasks\n", + "- 💡 Get tailored support and examples\n", + "- 📘 Learn Crawl4AI faster with step-by-step guidance" + ] + }, + { + "cell_type": "markdown", + "id": "41de6458", + "metadata": {}, + "source": [ + "### **Quickstart with Crawl4AI**" + ] + }, + { + "cell_type": "markdown", + "id": "1380e951", + "metadata": {}, + "source": [ + "#### 1. **Installation**\n", + "Install Crawl4AI and necessary dependencies:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05fecfad", + "metadata": {}, + "outputs": [], + "source": [ + "# %%capture\n", + "!pip install crawl4ai\n", + "!pip install nest_asyncio\n", + "!playwright install " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "2c2a74c8", + "metadata": {}, + "outputs": [], + "source": [ + "import asyncio\n", + "import nest_asyncio\n", + "nest_asyncio.apply()" + ] + }, + { + "cell_type": "markdown", + "id": "f3c558d7", + "metadata": {}, + "source": [ + "#### 2. 
**Basic Setup and Simple Crawl**" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "003376f3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[LOG] 🚀 Crawling done for https://www.nbcnews.com/business, success: True, time taken: 1.49 seconds\n", + "[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.10 seconds\n", + "[LOG] 🔥 Extracting semantic blocks for https://www.nbcnews.com/business, Strategy: AsyncWebCrawler\n", + "[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.10 seconds.\n", + "IE 11 is not supported. For an optimal experience visit our site on another browser.\n", + "\n", + "[Morning Rundown: Trump and Harris' vastly different closing pitches, why Kim Jong Un is helping Russia, and an ancient city is discovered by accident](https://www.nbcnews.com/news/harris-speech-ellipse-ancient-mayan-city-morning-rundown-rcna177973)[](https://www.nbcnews.com/news/harris-speech-ellipse-ancient-mayan-city-morning-rundown-rcna177973)\n", + "\n", + "Skip to Content\n", + "\n", + "[NBC News Logo](https://www.nbcnews.com)\n", + "\n", + "Spon\n" + ] + } + ], + "source": [ + "import asyncio\n", + "from crawl4ai import AsyncWebCrawler\n", + "\n", + "async def simple_crawl():\n", + " async with AsyncWebCrawler() as crawler:\n", + " result = await crawler.arun(\n", + " url=\"https://www.nbcnews.com/business\",\n", + " bypass_cache=True # By default this is False, meaning the cache will be used\n", + " )\n", + " print(result.markdown[:500]) # Print the first 500 characters\n", + " \n", + "asyncio.run(simple_crawl())" + ] + }, + { + "cell_type": "markdown", + "id": "da9b4d50", + "metadata": {}, + "source": [ + "#### 3. **Dynamic Content Handling**" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "5bb8c1e4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[LOG] 🌤️ Warming up the AsyncWebCrawler\n", + "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n", + "[LOG] 🕸️ Crawling https://www.nbcnews.com/business using AsyncPlaywrightCrawlerStrategy...\n", + "[LOG] ✅ Crawled https://www.nbcnews.com/business successfully!\n", + "[LOG] 🚀 Crawling done for https://www.nbcnews.com/business, success: True, time taken: 4.52 seconds\n", + "[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.15 seconds\n", + "[LOG] 🔥 Extracting semantic blocks for https://www.nbcnews.com/business, Strategy: AsyncWebCrawler\n", + "[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.15 seconds.\n", + "IE 11 is not supported. 
For an optimal experience visit our site on another browser.\n", + "\n", + "[Morning Rundown: Trump and Harris' vastly different closing pitches, why Kim Jong Un is helping Russia, and an ancient city is discovered by accident](https://www.nbcnews.com/news/harris-speech-ellipse-ancient-mayan-city-morning-rundown-rcna177973)[](https://www.nbcnews.com/news/harris-speech-ellipse-ancient-mayan-city-morning-rundown-rcna177973)\n", + "\n", + "Skip to Content\n", + "\n", + "[NBC News Logo](https://www.nbcnews.com)\n", + "\n", + "Spon\n" + ] + } + ], + "source": [ + "async def crawl_dynamic_content():\n", + " # You can use wait_for to wait for a condition to be met before returning the result\n", + " # wait_for = \"\"\"() => {\n", + " # return Array.from(document.querySelectorAll('article.tease-card')).length > 10;\n", + " # }\"\"\"\n", + "\n", + " # wait_for can be also just a css selector\n", + " # wait_for = \"article.tease-card:nth-child(10)\"\n", + "\n", + " async with AsyncWebCrawler(verbose=True) as crawler:\n", + " js_code = [\n", + " \"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();\"\n", + " ]\n", + " result = await crawler.arun(\n", + " url=\"https://www.nbcnews.com/business\",\n", + " js_code=js_code,\n", + " # wait_for=wait_for,\n", + " bypass_cache=True,\n", + " )\n", + " print(result.markdown[:500]) # Print first 500 characters\n", + "\n", + "asyncio.run(crawl_dynamic_content())" + ] + }, + { + "cell_type": "markdown", + "id": "86febd8d", + "metadata": {}, + "source": [ + "#### 4. **Content Cleaning and Fit Markdown**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e8ab01f", + "metadata": {}, + "outputs": [], + "source": [ + "async def clean_content():\n", + " async with AsyncWebCrawler() as crawler:\n", + " result = await crawler.arun(\n", + " url=\"https://janineintheworld.com/places-to-visit-in-central-mexico\",\n", + " excluded_tags=['nav', 'footer', 'aside'],\n", + " remove_overlay_elements=True,\n", + " word_count_threshold=10,\n", + " bypass_cache=True\n", + " )\n", + " full_markdown_length = len(result.markdown)\n", + " fit_markdown_length = len(result.fit_markdown)\n", + " print(f\"Full Markdown Length: {full_markdown_length}\")\n", + " print(f\"Fit Markdown Length: {fit_markdown_length}\")\n", + " print(result.fit_markdown[:1000])\n", + " \n", + "\n", + "asyncio.run(clean_content())" + ] + }, + { + "cell_type": "markdown", + "id": "55715146", + "metadata": {}, + "source": [ + "#### 5. 
**Link Analysis and Smart Filtering**" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "2ae47c69", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[LOG] 🚀 Crawling done for https://www.nbcnews.com/business, success: True, time taken: 0.93 seconds\n", + "[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.11 seconds\n", + "[LOG] 🔥 Extracting semantic blocks for https://www.nbcnews.com/business, Strategy: AsyncWebCrawler\n", + "[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.11 seconds.\n", + "Found 107 internal links\n", + "Found 58 external links\n", + "Href: https://www.nbcnews.com/news/harris-speech-ellipse-ancient-mayan-city-morning-rundown-rcna177973\n", + "Text: Morning Rundown: Trump and Harris' vastly different closing pitches, why Kim Jong Un is helping Russia, and an ancient city is discovered by accident\n", + "\n", + "Href: https://www.nbcnews.com\n", + "Text: NBC News Logo\n", + "\n", + "Href: https://www.nbcnews.com/politics/2024-election/live-blog/kamala-harris-donald-trump-rally-election-live-updates-rcna177529\n", + "Text: 2024 Election\n", + "\n", + "Href: https://www.nbcnews.com/politics\n", + "Text: Politics\n", + "\n", + "Href: https://www.nbcnews.com/us-news\n", + "Text: U.S. News\n", + "\n" + ] + } + ], + "source": [ + "\n", + "async def link_analysis():\n", + " async with AsyncWebCrawler() as crawler:\n", + " result = await crawler.arun(\n", + " url=\"https://www.nbcnews.com/business\",\n", + " bypass_cache=True,\n", + " exclude_external_links=True,\n", + " exclude_social_media_links=True,\n", + " # exclude_domains=[\"facebook.com\", \"twitter.com\"]\n", + " )\n", + " print(f\"Found {len(result.links['internal'])} internal links\")\n", + " print(f\"Found {len(result.links['external'])} external links\")\n", + "\n", + " for link in result.links['internal'][:5]:\n", + " print(f\"Href: {link['href']}\\nText: {link['text']}\\n\")\n", + " \n", + "\n", + "asyncio.run(link_analysis())" + ] + }, + { + "cell_type": "markdown", + "id": "80cceef3", + "metadata": {}, + "source": [ + "#### 6. 
**Media Handling**" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "1fed7f99", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[LOG] 🚀 Crawling done for https://www.nbcnews.com/business, success: True, time taken: 1.42 seconds\n", + "[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.11 seconds\n", + "[LOG] 🔥 Extracting semantic blocks for https://www.nbcnews.com/business, Strategy: AsyncWebCrawler\n", + "[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.12 seconds.\n", + "Image URL: https://media-cldnry.s-nbcnews.com/image/upload/t_focal-762x508,f_auto,q_auto:best/rockcms/2024-10/241023-NM-Chilccare-jg-27b982.jpg, Alt: , Score: 4\n", + "Image URL: https://media-cldnry.s-nbcnews.com/image/upload/t_focal-80x80,f_auto,q_auto:best/rockcms/2024-10/241030-china-ev-electric-mb-0746-cae05c.jpg, Alt: Volkswagen Workshop in Hefei, Score: 5\n", + "Image URL: https://media-cldnry.s-nbcnews.com/image/upload/t_focal-80x80,f_auto,q_auto:best/rockcms/2024-10/241029-nyc-subway-sandwich-2021-ac-922p-a92374.jpg, Alt: A sub is prepared at a Subway restaurant in Manhattan, New York City, Score: 5\n", + "Image URL: https://media-cldnry.s-nbcnews.com/image/upload/t_focal-80x80,f_auto,q_auto:best/rockcms/2024-10/241029-suv-gravity-ch-1618-752415.jpg, Alt: The Lucid Gravity car., Score: 5\n", + "Image URL: https://media-cldnry.s-nbcnews.com/image/upload/t_focal-80x80,f_auto,q_auto:best/rockcms/2024-10/241029-dearborn-michigan-f-150-ford-ranger-trucks-assembly-line-ac-426p-614f0b.jpg, Alt: Ford Introduces new F-150 And Ranger Trucks At Their Dearborn Plant, Score: 5\n" + ] + } + ], + "source": [ + "async def media_handling():\n", + " async with AsyncWebCrawler() as crawler:\n", + " result = await crawler.arun(\n", + " url=\"https://www.nbcnews.com/business\", \n", + " bypass_cache=True,\n", + " exclude_external_images=False,\n", + " screenshot=True\n", + " )\n", + " for img in result.media['images'][:5]:\n", + " print(f\"Image URL: {img['src']}, Alt: {img['alt']}, Score: {img['score']}\")\n", + " \n", + "asyncio.run(media_handling())" + ] + }, + { + "cell_type": "markdown", + "id": "9290499a", + "metadata": {}, + "source": [ + "#### 7. **Using Hooks for Custom Workflow**" + ] + }, + { + "cell_type": "markdown", + "id": "9d069c2b", + "metadata": {}, + "source": [ + "Hooks in Crawl4AI allow you to run custom logic at specific stages of the crawling process. This can be invaluable for scenarios like setting custom headers, logging activities, or processing content before it is returned. Below is an example of a basic workflow using a hook, followed by a complete list of available hooks and explanations on their usage." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "bc4d2fc8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Hook] Preparing to navigate...\n", + "[LOG] 🚀 Crawling done for https://crawl4ai.com, success: True, time taken: 3.49 seconds\n", + "[LOG] 🚀 Content extracted for https://crawl4ai.com, success: True, time taken: 0.03 seconds\n", + "[LOG] 🔥 Extracting semantic blocks for https://crawl4ai.com, Strategy: AsyncWebCrawler\n", + "[LOG] 🚀 Extraction done for https://crawl4ai.com, time taken: 0.03 seconds.\n", + "[Crawl4AI Documentation](https://docs.crawl4ai.com/)\n", + "\n", + " * [ Home ](.)\n", + " * [ Installation ](basic/installation/)\n", + " * [ Quick Start ](basic/quickstart/)\n", + " * [ Search ](#)\n", + "\n", + "\n", + "\n", + " * Home\n", + " * [Installation](basic/installation/)\n", + " * [Quick Start](basic/quickstart/)\n", + " * Basic\n", + " * [Simple Crawling](basic/simple-crawling/)\n", + " * [Output Formats](basic/output-formats/)\n", + " * [Browser Configuration](basic/browser-config/)\n", + " * [Page Interaction](basic/page-interaction/)\n", + " * [Content Selection](basic/con\n" + ] + } + ], + "source": [ + "async def custom_hook_workflow():\n", + " async with AsyncWebCrawler() as crawler:\n", + " # Set a 'before_goto' hook to run custom code just before navigation\n", + " crawler.crawler_strategy.set_hook(\"before_goto\", lambda page: print(\"[Hook] Preparing to navigate...\"))\n", + " \n", + " # Perform the crawl operation\n", + " result = await crawler.arun(\n", + " url=\"https://crawl4ai.com\",\n", + " bypass_cache=True\n", + " )\n", + " print(result.markdown[:500]) # Display the first 500 characters\n", + "\n", + "asyncio.run(custom_hook_workflow())" + ] + }, + { + "cell_type": "markdown", + "id": "3ff45e21", + "metadata": {}, + "source": [ + "List of available hooks and examples for each stage of the crawling process:\n", + "\n", + "- **on_browser_created**\n", + " ```python\n", + " async def on_browser_created_hook(browser):\n", + " print(\"[Hook] Browser created\")\n", + " ```\n", + "\n", + "- **before_goto**\n", + " ```python\n", + " async def before_goto_hook(page):\n", + " await page.set_extra_http_headers({\"X-Test-Header\": \"test\"})\n", + " ```\n", + "\n", + "- **after_goto**\n", + " ```python\n", + " async def after_goto_hook(page):\n", + " print(f\"[Hook] Navigated to {page.url}\")\n", + " ```\n", + "\n", + "- **on_execution_started**\n", + " ```python\n", + " async def on_execution_started_hook(page):\n", + " print(\"[Hook] JavaScript execution started\")\n", + " ```\n", + "\n", + "- **before_return_html**\n", + " ```python\n", + " async def before_return_html_hook(page, html):\n", + " print(f\"[Hook] HTML length: {len(html)}\")\n", + " ```" + ] + }, + { + "cell_type": "markdown", + "id": "2d56ebb1", + "metadata": {}, + "source": [ + "#### 8. **Session-Based Crawling**\n", + "\n", + "When to Use Session-Based Crawling: \n", + "Session-based crawling is especially beneficial when navigating through multi-page content where each page load needs to maintain the same session context. For instance, in cases where a “Next Page” button must be clicked to load subsequent data, the new data often replaces the previous content. 
Here, session-based crawling keeps the browser state intact across each interaction, allowing for sequential actions within the same session.\n", + "\n", + "Example: Multi-Page Navigation Using JavaScript\n", + "In this example, we’ll navigate through multiple pages by clicking a \"Next Page\" button. After each page load, we extract the new content and repeat the process." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7bfebae", + "metadata": {}, + "outputs": [], + "source": [ + "async def multi_page_session_crawl():\n", + " async with AsyncWebCrawler() as crawler:\n", + " session_id = \"page_navigation_session\"\n", + " url = \"https://example.com/paged-content\"\n", + "\n", + " for page_number in range(1, 4):\n", + " result = await crawler.arun(\n", + " url=url,\n", + " session_id=session_id,\n", + " js_code=\"document.querySelector('.next-page-button').click();\" if page_number > 1 else None,\n", + " css_selector=\".content-section\",\n", + " bypass_cache=True\n", + " )\n", + " print(f\"Page {page_number} Content:\")\n", + " print(result.markdown[:500]) # Print first 500 characters\n", + "\n", + "# asyncio.run(multi_page_session_crawl())" + ] + }, + { + "cell_type": "markdown", + "id": "ad32a778", + "metadata": {}, + "source": [ + "#### 9. **Using Extraction Strategies**\n", + "\n", + "**LLM Extraction**\n", + "\n", + "This example demonstrates how to use language model-based extraction to retrieve structured data from a pricing page on OpenAI’s site." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "3011a7c5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "--- Extracting Structured Data with openai/gpt-4o-mini ---\n", + "[LOG] 🌤️ Warming up the AsyncWebCrawler\n", + "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n", + "[LOG] 🕸️ Crawling https://openai.com/api/pricing/ using AsyncPlaywrightCrawlerStrategy...\n", + "[LOG] ✅ Crawled https://openai.com/api/pricing/ successfully!\n", + "[LOG] 🚀 Crawling done for https://openai.com/api/pricing/, success: True, time taken: 1.29 seconds\n", + "[LOG] 🚀 Content extracted for https://openai.com/api/pricing/, success: True, time taken: 0.13 seconds\n", + "[LOG] 🔥 Extracting semantic blocks for https://openai.com/api/pricing/, Strategy: AsyncWebCrawler\n", + "[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 0\n", + "[LOG] Extracted 26 blocks from URL: https://openai.com/api/pricing/ block index: 0\n", + "[LOG] 🚀 Extraction done for https://openai.com/api/pricing/, time taken: 15.12 seconds.\n", + "[{'model_name': 'gpt-4o', 'input_fee': '$2.50 / 1M input tokens', 'output_fee': '$10.00 / 1M output tokens', 'error': False}, {'model_name': 'gpt-4o-2024-08-06', 'input_fee': '$2.50 / 1M input tokens', 'output_fee': '$10.00 / 1M output tokens', 'error': False}, {'model_name': 'gpt-4o-audio-preview', 'input_fee': '$2.50 / 1M input tokens', 'output_fee': '$10.00 / 1M output tokens', 'error': False}, {'model_name': 'gpt-4o-audio-preview-2024-10-01', 'input_fee': '$2.50 / 1M input tokens', 'output_fee': '$10.00 / 1M output tokens', 'error': False}, {'model_name': 'gpt-4o-2024-05-13', 'input_fee': '$5.00 / 1M input tokens', 'output_fee': '$15.00 / 1M output tokens', 'error': False}]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/unclecode/devs/crawl4ai/venv/lib/python3.10/site-packages/pydantic/main.py:347: UserWarning: Pydantic serializer warnings:\n", + " Expected `PromptTokensDetails` but got `dict` - 
serialized value may not be as expected\n", + " return self.__pydantic_serializer__.to_python(\n" + ] + } + ], + "source": [ + "from crawl4ai.extraction_strategy import LLMExtractionStrategy\n", + "from pydantic import BaseModel, Field\n", + "import os, json\n", + "\n", + "class OpenAIModelFee(BaseModel):\n", + " model_name: str = Field(..., description=\"Name of the OpenAI model.\")\n", + " input_fee: str = Field(..., description=\"Fee for input token for the OpenAI model.\")\n", + " output_fee: str = Field(\n", + " ..., description=\"Fee for output token for the OpenAI model.\"\n", + " )\n", + "\n", + "async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: dict = None):\n", + " print(f\"\\n--- Extracting Structured Data with {provider} ---\")\n", + " \n", + " # Skip if API token is missing (for providers that require it)\n", + " if api_token is None and provider != \"ollama\":\n", + " print(f\"API token is required for {provider}. Skipping this example.\")\n", + " return\n", + "\n", + " extra_args = {\"extra_headers\": extra_headers} if extra_headers else {}\n", + "\n", + " async with AsyncWebCrawler(verbose=True) as crawler:\n", + " result = await crawler.arun(\n", + " url=\"https://openai.com/api/pricing/\",\n", + " word_count_threshold=1,\n", + " extraction_strategy=LLMExtractionStrategy(\n", + " provider=provider,\n", + " api_token=api_token,\n", + " schema=OpenAIModelFee.schema(),\n", + " extraction_type=\"schema\",\n", + " instruction=\"\"\"Extract all model names along with fees for input and output tokens.\"\n", + " \"{model_name: 'GPT-4', input_fee: 'US$10.00 / 1M tokens', output_fee: 'US$30.00 / 1M tokens'}.\"\"\",\n", + " **extra_args\n", + " ),\n", + " bypass_cache=True,\n", + " )\n", + " print(json.loads(result.extracted_content)[:5])\n", + "\n", + "# Usage:\n", + "await extract_structured_data_using_llm(\"openai/gpt-4o-mini\", os.getenv(\"OPENAI_API_KEY\"))" + ] + }, + { + "cell_type": "markdown", + "id": "6532db9d", + "metadata": {}, + "source": [ + "**Cosine Similarity Strategy**\n", + "\n", + "This strategy uses semantic clustering to extract relevant content based on contextual similarity, which is helpful when extracting related sections from a single topic." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "ec079108", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[LOG] Loading Extraction Model for mps device.\n", + "[LOG] Loading Multilabel Classifier for mps device.\n", + "[LOG] Model loaded sentence-transformers/all-MiniLM-L6-v2, models/reuters, took 5.193778038024902 seconds\n", + "[LOG] 🚀 Crawling done for https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156, success: True, time taken: 1.37 seconds\n", + "[LOG] 🚀 Content extracted for https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156, success: True, time taken: 0.07 seconds\n", + "[LOG] 🔥 Extracting semantic blocks for https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156, Strategy: AsyncWebCrawler\n", + "[LOG] 🚀 Assign tags using mps\n", + "[LOG] 🚀 Categorization done in 0.55 seconds\n", + "[LOG] 🚀 Extraction done for https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156, time taken: 6.63 seconds.\n", + "[{'index': 1, 'tags': ['news_&_social_concern'], 'content': \"McDonald's 2024 combo: Inflation, a health crisis and a side of politics # McDonald's 2024 combo: Inflation, a health crisis and a side of politics\"}, {'index': 2, 'tags': ['business_&_entrepreneurs', 'news_&_social_concern'], 'content': 'Like many major brands, McDonald’s raked in big profits as the economy reopened from the pandemic. In October 2022, [executives were boasting](https://www.cnbc.com/2022/10/27/mcdonalds-mcd-earnings-q3-2022.html) that they’d been raising prices without crimping traffic, even as competitors began to warn that some customers were closing their wallets after inflation peaked above 9% that summer. Still, the U.S. had repeatedly dodged a much-forecast recession, and [Americans kept spending on nonessentials](https://www.nbcnews.com/business/economy/year-peak-inflation-travel-leisure-mostly-cost-less-rcna92760) like travel and dining out — despite regularly relaying to pollsters their dismal views of an otherwise solid economy. Even so, 64% of consumers said they noticed price increases at quick-service restaurants in September, more than at any other type of venue, according to a survey by Datassential, a food and beverage market researcher. Politicians are still drawing attention to fast-food costs, too, as the election season barrels toward a tumultuous finish. A group of Democratic senators this month [denounced McDonald’s for menu prices](https://www.nbcnews.com/news/us-news/democratic-senators-slam-mcdonalds-menu-price-hikes-rcna176380) that they said outstripped inflation, accusing the company of looking to profit “at the expense of people’s ability to put food on the table.” The financial results come toward the end of a humbling year for the nearly $213 billion restaurant chain, whose shares remained steady on the heels of its latest earnings. Kempczinski [sought to reassure investors](https://www.cnbc.com/2024/10/29/mcdonalds-e-coli-outbreak-ceo-comments.html) that [the E. coli outbreak](https://www.nbcnews.com/health/health-news/illnesses-linked-mcdonalds-e-coli-outbreak-rise-75-cdc-says-rcna177260), linked to Quarter Pounder burgers, was under control after the health crisis temporarily dented the company’s stock and caused U.S. 
foot traffic to drop nearly 10% in the days afterward, according to estimates by Gordon Haskett financial researchers. The fast-food giant [reported Tuesday](https://www.cnbc.com/2024/10/29/mcdonalds-mcd-earnings-q3-2024.html) that it had reversed its recent U.S. sales drop, posting a 0.3% uptick in the third quarter. Foot traffic was still down slightly, but the company said its summer of discounts was paying off. But by early this year, [photos of eye-watering menu prices](https://x.com/sam_learner/status/1681367351143301129) at some McDonald’s locations — including an $18 Big Mac combo at a Connecticut rest stop from July 2023 — went viral, bringing diners’ long-simmering frustrations to a boiling point that the company couldn’t ignore. On an earnings call in April, Kempczinski acknowledged that foot traffic had fallen. “We will stay laser-focused on providing an unparalleled experience with simple, everyday value and affordability that our consumers can count on as they continue to be mindful about their spending,” CEO Chris Kempczinski [said in a statement](https://www.prnewswire.com/news-releases/mcdonalds-reports-third-quarter-2024-results-302289216.html?Fds-Load-Behavior=force-external) alongside the earnings report.'}, {'index': 3, 'tags': ['food_&_dining', 'news_&_social_concern'], 'content': '![mcdonalds drive-thru economy fast food](https://media-cldnry.s-nbcnews.com/image/upload/t_fit-760w,f_auto,q_auto:best/rockcms/2024-10/241024-los-angeles-mcdonalds-drive-thru-ac-1059p-cfc311.jpg)McDonald’s has had some success leaning into discounts this year. Eric Thayer / Bloomberg via Getty Images file'}, {'index': 4, 'tags': ['business_&_entrepreneurs', 'food_&_dining', 'news_&_social_concern'], 'content': 'McDonald’s has faced a customer revolt over pricey Big Macs, an unsolicited cameo in election-season crossfire, and now an E. coli outbreak — just as the company had been luring customers back with more affordable burgers. Despite a difficult quarter, McDonald’s looks resilient in the face of various pressures, analysts say — something the company shares with U.S. consumers overall. “Consumers continue to be even more discriminating with every dollar that they spend,” he said at the time. Going forward, McDonald’s would be “laser-focused” on affordability. “McDonald’s has also done a good job of embedding the brand in popular culture to enhance its relevance and meaning around fun and family. But it also needed to modify the product line to meet the expectations of a consumer who is on a tight budget,” he said. “The thing that McDonald’s had struggled with, and why I think we’re seeing kind of an inflection point, is a value proposition,” Senatore said. “McDonald’s menu price increases had run ahead of a lot of its restaurant peers. … Consumers are savvy enough to know that.” For many consumers, the fast-food giant’s menus serve as an informal gauge of the economy overall, said Sara Senatore, a Bank of America analyst covering restaurants. “The spotlight is always on McDonald’s because it’s so big” and something of a “bellwether,” she said. McDonald’s didn’t respond to requests for comment.'}, {'index': 5, 'tags': ['business_&_entrepreneurs', 'food_&_dining'], 'content': 'Mickey D’s’ $5 meal deal, which it launched in late June to jumpstart slumping sales, has given the company an appealing price point to advertise nationwide, Senatore said, speculating that it could open the door to a new permanent value offering. 
But before that promotion rolled out, the company’s reputation as a low-cost option had taken a bruising hit.'}]\n" + ] + } + ], + "source": [ + "from crawl4ai.extraction_strategy import CosineStrategy\n", + "\n", + "async def cosine_similarity_extraction():\n", + " async with AsyncWebCrawler() as crawler:\n", + " strategy = CosineStrategy(\n", + " word_count_threshold=10,\n", + " max_dist=0.2, # Maximum distance between two words\n", + " linkage_method=\"ward\", # Linkage method for hierarchical clustering (ward, complete, average, single)\n", + " top_k=3, # Number of top keywords to extract\n", + " sim_threshold=0.3, # Similarity threshold for clustering\n", + " semantic_filter=\"McDonald's economic impact, American consumer trends\", # Keywords to filter the content semantically using embeddings\n", + " verbose=True\n", + " )\n", + " \n", + " result = await crawler.arun(\n", + " url=\"https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156\",\n", + " extraction_strategy=strategy\n", + " )\n", + " print(json.loads(result.extracted_content)[:5])\n", + "\n", + "asyncio.run(cosine_similarity_extraction())\n" + ] + }, + { + "cell_type": "markdown", + "id": "ff423629", + "metadata": {}, + "source": [ + "#### 10. **Conclusion and Next Steps**\n", + "\n", + "You’ve explored core features of Crawl4AI, including dynamic content handling, link analysis, and advanced extraction strategies. Visit our documentation for further details on using Crawl4AI’s extensive features.\n", + "\n", + "- GitHub Repository: [https://github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)\n", + "- Twitter: [@unclecode](https://twitter.com/unclecode)\n", + "- Website: [https://crawl4ai.com](https://crawl4ai.com)\n", + "\n", + "Happy Crawling with Crawl4AI! 
🕷️🤖\n" + ] + }, + { + "cell_type": "markdown", + "id": "d34c1d35", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/examples/quickstart_async.config.py b/docs/examples/quickstart_async.config.py new file mode 100644 index 0000000000000000000000000000000000000000..4c4a9d8643becc9af0d734b7cec15ddf1c11c2de --- /dev/null +++ b/docs/examples/quickstart_async.config.py @@ -0,0 +1,610 @@ +import os, sys + +sys.path.append( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +) + +import asyncio +import time +import json +import re +from typing import Dict, List +from bs4 import BeautifulSoup +from pydantic import BaseModel, Field +from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator +from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter +from crawl4ai.extraction_strategy import ( + JsonCssExtractionStrategy, + LLMExtractionStrategy, +) + +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) + +print("Crawl4AI: Advanced Web Crawling and Data Extraction") +print("GitHub Repository: https://github.com/unclecode/crawl4ai") +print("Twitter: @unclecode") +print("Website: https://crawl4ai.com") + + +# Basic Example - Simple Crawl +async def simple_crawl(): + print("\n--- Basic Usage ---") + browser_config = BrowserConfig(headless=True) + crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", config=crawler_config + ) + print(result.markdown[:500]) + + +async def clean_content(): + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + excluded_tags=["nav", "footer", "aside"], + remove_overlay_elements=True, + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter( + threshold=0.48, threshold_type="fixed", min_word_threshold=0 + ), + options={"ignore_links": True}, + ), + ) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://en.wikipedia.org/wiki/Apple", + config=crawler_config, + ) + full_markdown_length = len(result.markdown_v2.raw_markdown) + fit_markdown_length = len(result.markdown_v2.fit_markdown) + print(f"Full Markdown Length: {full_markdown_length}") + print(f"Fit Markdown Length: {fit_markdown_length}") + +async def link_analysis(): + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.ENABLED, + exclude_external_links=True, + exclude_social_media_links=True, + ) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", + config=crawler_config, + ) + print(f"Found {len(result.links['internal'])} internal links") + print(f"Found {len(result.links['external'])} external links") + + for link in result.links['internal'][:5]: + print(f"Href: {link['href']}\nText: {link['text']}\n") + +# JavaScript Execution Example +async def simple_example_with_running_js_code(): + print("\n--- Executing JavaScript and Using 
CSS Selectors ---") + + browser_config = BrowserConfig(headless=True, java_script_enabled=True) + + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + js_code="const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();", + # wait_for="() => { return Array.from(document.querySelectorAll('article.tease-card')).length > 10; }" + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", config=crawler_config + ) + print(result.markdown[:500]) + + +# CSS Selector Example +async def simple_example_with_css_selector(): + print("\n--- Using CSS Selectors ---") + browser_config = BrowserConfig(headless=True) + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, css_selector=".wide-tease-item__description" + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", config=crawler_config + ) + print(result.markdown[:500]) + +async def media_handling(): + crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, exclude_external_images=True, screenshot=True) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", + config=crawler_config + ) + for img in result.media['images'][:5]: + print(f"Image URL: {img['src']}, Alt: {img['alt']}, Score: {img['score']}") + +async def custom_hook_workflow(verbose=True): + async with AsyncWebCrawler() as crawler: + # Set a 'before_goto' hook to run custom code just before navigation + crawler.crawler_strategy.set_hook("before_goto", lambda page, context: print("[Hook] Preparing to navigate...")) + + # Perform the crawl operation + result = await crawler.arun( + url="https://crawl4ai.com" + ) + print(result.markdown_v2.raw_markdown[:500].replace("\n", " -- ")) + + +# Proxy Example +async def use_proxy(): + print("\n--- Using a Proxy ---") + browser_config = BrowserConfig( + headless=True, + proxy_config={ + "server": "http://proxy.example.com:8080", + "username": "username", + "password": "password", + }, + ) + crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", config=crawler_config + ) + if result.success: + print(result.markdown[:500]) + + +# Screenshot Example +async def capture_and_save_screenshot(url: str, output_path: str): + browser_config = BrowserConfig(headless=True) + crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, screenshot=True) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url=url, config=crawler_config) + + if result.success and result.screenshot: + import base64 + + screenshot_data = base64.b64decode(result.screenshot) + with open(output_path, "wb") as f: + f.write(screenshot_data) + print(f"Screenshot saved successfully to {output_path}") + else: + print("Failed to capture screenshot") + + +# LLM Extraction Example +class OpenAIModelFee(BaseModel): + model_name: str = Field(..., description="Name of the OpenAI model.") + input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") + output_fee: str = Field( + ..., description="Fee for output token for the OpenAI model." 
+ ) + + +async def extract_structured_data_using_llm( + provider: str, api_token: str = None, extra_headers: Dict[str, str] = None +): + print(f"\n--- Extracting Structured Data with {provider} ---") + + if api_token is None and provider != "ollama": + print(f"API token is required for {provider}. Skipping this example.") + return + + browser_config = BrowserConfig(headless=True) + + extra_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000} + if extra_headers: + extra_args["extra_headers"] = extra_headers + + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + word_count_threshold=1, + page_timeout=80000, + extraction_strategy=LLMExtractionStrategy( + provider=provider, + api_token=api_token, + schema=OpenAIModelFee.model_json_schema(), + extraction_type="schema", + instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. + Do not miss any models in the entire content.""", + extra_args=extra_args, + ), + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://openai.com/api/pricing/", config=crawler_config + ) + print(result.extracted_content) + + +# CSS Extraction Example +async def extract_structured_data_using_css_extractor(): + print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---") + schema = { + "name": "KidoCode Courses", + "baseSelector": "section.charge-methodology .w-tab-content > div", + "fields": [ + { + "name": "section_title", + "selector": "h3.heading-50", + "type": "text", + }, + { + "name": "section_description", + "selector": ".charge-content", + "type": "text", + }, + { + "name": "course_name", + "selector": ".text-block-93", + "type": "text", + }, + { + "name": "course_description", + "selector": ".course-content-text", + "type": "text", + }, + { + "name": "course_icon", + "selector": ".image-92", + "type": "attribute", + "attribute": "src", + }, + ], + } + + browser_config = BrowserConfig(headless=True, java_script_enabled=True) + + js_click_tabs = """ + (async () => { + const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div"); + for(let tab of tabs) { + tab.scrollIntoView(); + tab.click(); + await new Promise(r => setTimeout(r, 500)); + } + })(); + """ + + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + extraction_strategy=JsonCssExtractionStrategy(schema), + js_code=[js_click_tabs], + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://www.kidocode.com/degrees/technology", config=crawler_config + ) + + companies = json.loads(result.extracted_content) + print(f"Successfully extracted {len(companies)} companies") + print(json.dumps(companies[0], indent=2)) + + +# Dynamic Content Examples - Method 1 +async def crawl_dynamic_content_pages_method_1(): + print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---") + first_commit = "" + + async def on_execution_started(page, **kwargs): + nonlocal first_commit + try: + while True: + await page.wait_for_selector("li.Box-sc-g0xbh4-0 h4") + commit = await page.query_selector("li.Box-sc-g0xbh4-0 h4") + commit = await commit.evaluate("(element) => element.textContent") + commit = re.sub(r"\s+", "", commit) + if commit and commit != first_commit: + first_commit = commit + break + await asyncio.sleep(0.5) + except Exception as e: + print(f"Warning: New content didn't appear after JavaScript execution: {e}") + + browser_config = 
BrowserConfig(headless=False, java_script_enabled=True) + + async with AsyncWebCrawler(config=browser_config) as crawler: + crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started) + + url = "https://github.com/microsoft/TypeScript/commits/main" + session_id = "typescript_commits_session" + all_commits = [] + + js_next_page = """ + const button = document.querySelector('a[data-testid="pagination-next-button"]'); + if (button) button.click(); + """ + + for page in range(3): + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + css_selector="li.Box-sc-g0xbh4-0", + js_code=js_next_page if page > 0 else None, + js_only=page > 0, + session_id=session_id, + ) + + result = await crawler.arun(url=url, config=crawler_config) + assert result.success, f"Failed to crawl page {page + 1}" + + soup = BeautifulSoup(result.cleaned_html, "html.parser") + commits = soup.select("li") + all_commits.extend(commits) + + print(f"Page {page + 1}: Found {len(commits)} commits") + + print(f"Successfully crawled {len(all_commits)} commits across 3 pages") + + +# Dynamic Content Examples - Method 2 +async def crawl_dynamic_content_pages_method_2(): + print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---") + + browser_config = BrowserConfig(headless=False, java_script_enabled=True) + + js_next_page_and_wait = """ + (async () => { + const getCurrentCommit = () => { + const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4'); + return commits.length > 0 ? commits[0].textContent.trim() : null; + }; + + const initialCommit = getCurrentCommit(); + const button = document.querySelector('a[data-testid="pagination-next-button"]'); + if (button) button.click(); + + while (true) { + await new Promise(resolve => setTimeout(resolve, 100)); + const newCommit = getCurrentCommit(); + if (newCommit && newCommit !== initialCommit) { + break; + } + } + })(); + """ + + schema = { + "name": "Commit Extractor", + "baseSelector": "li.Box-sc-g0xbh4-0", + "fields": [ + { + "name": "title", + "selector": "h4.markdown-title", + "type": "text", + "transform": "strip", + }, + ], + } + + async with AsyncWebCrawler(config=browser_config) as crawler: + url = "https://github.com/microsoft/TypeScript/commits/main" + session_id = "typescript_commits_session" + all_commits = [] + + extraction_strategy = JsonCssExtractionStrategy(schema) + + for page in range(3): + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + css_selector="li.Box-sc-g0xbh4-0", + extraction_strategy=extraction_strategy, + js_code=js_next_page_and_wait if page > 0 else None, + js_only=page > 0, + session_id=session_id, + ) + + result = await crawler.arun(url=url, config=crawler_config) + assert result.success, f"Failed to crawl page {page + 1}" + + commits = json.loads(result.extracted_content) + all_commits.extend(commits) + print(f"Page {page + 1}: Found {len(commits)} commits") + + print(f"Successfully crawled {len(all_commits)} commits across 3 pages") + + +async def cosine_similarity_extraction(): + crawl_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + extraction_strategy=CosineStrategy( + word_count_threshold=10, + max_dist=0.2, # Maximum distance between two words + linkage_method="ward", # Linkage method for hierarchical clustering (ward, complete, average, single) + top_k=3, # Number of top keywords to extract + sim_threshold=0.3, # Similarity threshold for clustering + semantic_filter="McDonald's economic impact, American consumer trends", # Keywords to filter the content 
semantically using embeddings + verbose=True + ), + ) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156", + config=crawl_config + ) + print(json.loads(result.extracted_content)[:5]) + +# Browser Comparison +async def crawl_custom_browser_type(): + print("\n--- Browser Comparison ---") + + # Firefox + browser_config_firefox = BrowserConfig(browser_type="firefox", headless=True) + start = time.time() + async with AsyncWebCrawler(config=browser_config_firefox) as crawler: + result = await crawler.arun( + url="https://www.example.com", + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS), + ) + print("Firefox:", time.time() - start) + print(result.markdown[:500]) + + # WebKit + browser_config_webkit = BrowserConfig(browser_type="webkit", headless=True) + start = time.time() + async with AsyncWebCrawler(config=browser_config_webkit) as crawler: + result = await crawler.arun( + url="https://www.example.com", + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS), + ) + print("WebKit:", time.time() - start) + print(result.markdown[:500]) + + # Chromium (default) + browser_config_chromium = BrowserConfig(browser_type="chromium", headless=True) + start = time.time() + async with AsyncWebCrawler(config=browser_config_chromium) as crawler: + result = await crawler.arun( + url="https://www.example.com", + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS), + ) + print("Chromium:", time.time() - start) + print(result.markdown[:500]) + + +# Anti-Bot and User Simulation +async def crawl_with_user_simulation(): + browser_config = BrowserConfig( + headless=True, + user_agent_mode="random", + user_agent_generator_config={"device_type": "mobile", "os_type": "android"}, + ) + + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + magic=True, + simulate_user=True, + override_navigator=True, + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url="YOUR-URL-HERE", config=crawler_config) + print(result.markdown) + +async def ssl_certification(): + # Configure crawler to fetch SSL certificate + config = CrawlerRunConfig( + fetch_ssl_certificate=True, + cache_mode=CacheMode.BYPASS # Bypass cache to always get fresh certificates + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url='https://example.com', + config=config + ) + + if result.success and result.ssl_certificate: + cert = result.ssl_certificate + + # 1. Access certificate properties directly + print("\nCertificate Information:") + print(f"Issuer: {cert.issuer.get('CN', '')}") + print(f"Valid until: {cert.valid_until}") + print(f"Fingerprint: {cert.fingerprint}") + + # 2. 
Export certificate in different formats + cert.to_json(os.path.join(tmp_dir, "certificate.json")) # For analysis + print("\nCertificate exported to:") + print(f"- JSON: {os.path.join(tmp_dir, 'certificate.json')}") + + pem_data = cert.to_pem(os.path.join(tmp_dir, "certificate.pem")) # For web servers + print(f"- PEM: {os.path.join(tmp_dir, 'certificate.pem')}") + + der_data = cert.to_der(os.path.join(tmp_dir, "certificate.der")) # For Java apps + print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}") + +# Speed Comparison +async def speed_comparison(): + print("\n--- Speed Comparison ---") + + # Firecrawl comparison + from firecrawl import FirecrawlApp + + app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"]) + start = time.time() + scrape_status = app.scrape_url( + "https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]} + ) + end = time.time() + print("Firecrawl:") + print(f"Time taken: {end - start:.2f} seconds") + print(f"Content length: {len(scrape_status['markdown'])} characters") + print(f"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}") + print() + + # Crawl4AI comparisons + browser_config = BrowserConfig(headless=True) + + # Simple crawl + async with AsyncWebCrawler(config=browser_config) as crawler: + start = time.time() + result = await crawler.arun( + url="https://www.nbcnews.com/business", + config=CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, word_count_threshold=0 + ), + ) + end = time.time() + print("Crawl4AI (simple crawl):") + print(f"Time taken: {end - start:.2f} seconds") + print(f"Content length: {len(result.markdown)} characters") + print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}") + print() + + # Advanced filtering + start = time.time() + result = await crawler.arun( + url="https://www.nbcnews.com/business", + config=CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + word_count_threshold=0, + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter( + threshold=0.48, threshold_type="fixed", min_word_threshold=0 + ) + ), + ), + ) + end = time.time() + print("Crawl4AI (Markdown Plus):") + print(f"Time taken: {end - start:.2f} seconds") + print(f"Content length: {len(result.markdown_v2.raw_markdown)} characters") + print(f"Fit Markdown: {len(result.markdown_v2.fit_markdown)} characters") + print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}") + print() + + +# Main execution +async def main(): + # Basic examples + # await simple_crawl() + # await simple_example_with_running_js_code() + # await simple_example_with_css_selector() + + # Advanced examples + # await extract_structured_data_using_css_extractor() + await extract_structured_data_using_llm( + "openai/gpt-4o", os.getenv("OPENAI_API_KEY") + ) + # await crawl_dynamic_content_pages_method_1() + # await crawl_dynamic_content_pages_method_2() + + # Browser comparisons + # await crawl_custom_browser_type() + + # Performance testing + # await speed_comparison() + + # Screenshot example + # await capture_and_save_screenshot( + # "https://www.example.com", + # os.path.join(__location__, "tmp/example_screenshot.jpg") + # ) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py new file mode 100644 index 0000000000000000000000000000000000000000..e640e6bd2843ecd7aa650eb97ce29e8410961ada --- /dev/null +++ b/docs/examples/quickstart_async.py @@ -0,0 +1,640 @@ +import os, sys +# append parent directory to system path 
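+#
+# The speed comparison further down also expects FIRECRAWL_API_KEY to be available.
+# Export it in your shell (or keep it in a local .env file) rather than hardcoding
+# a real key in this script. A minimal sketch, assuming the optional python-dotenv
+# package is installed:
+#
+#   from dotenv import load_dotenv
+#   load_dotenv()  # reads FIRECRAWL_API_KEY, OPENAI_API_KEY, etc. from .env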
+sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+os.environ.setdefault("FIRECRAWL_API_KEY", "YOUR_FIRECRAWL_API_KEY")
+
+import asyncio
+# import nest_asyncio
+# nest_asyncio.apply()
+
+import time
+import json
+import os
+import re
+from typing import Dict, List
+from bs4 import BeautifulSoup
+from pydantic import BaseModel, Field
+from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
+from crawl4ai.extraction_strategy import (
+    JsonCssExtractionStrategy,
+    LLMExtractionStrategy,
+)
+
+__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+
+print("Crawl4AI: Advanced Web Crawling and Data Extraction")
+print("GitHub Repository: https://github.com/unclecode/crawl4ai")
+print("Twitter: @unclecode")
+print("Website: https://crawl4ai.com")
+
+
+async def simple_crawl():
+    print("\n--- Basic Usage ---")
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        result = await crawler.arun(url="https://www.nbcnews.com/business", cache_mode=CacheMode.BYPASS)
+        print(result.markdown[:500])  # Print first 500 characters
+
+async def simple_example_with_running_js_code():
+    print("\n--- Executing JavaScript and Using CSS Selectors ---")
+    # Optional wait_for condition to confirm new content has loaded
+    wait_for = """() => {
+        return Array.from(document.querySelectorAll('article.tease-card')).length > 10;
+    }"""
+
+    # wait_for can also be just a CSS selector
+    # wait_for = "article.tease-card:nth-child(10)"
+
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        js_code = [
+            "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
+        ]
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business",
+            js_code=js_code,
+            # wait_for=wait_for,
+            cache_mode=CacheMode.BYPASS,
+        )
+        print(result.markdown[:500])  # Print first 500 characters
+
+async def simple_example_with_css_selector():
+    print("\n--- Using CSS Selectors ---")
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business",
+            css_selector=".wide-tease-item__description",
+            cache_mode=CacheMode.BYPASS,
+        )
+        print(result.markdown[:500])  # Print first 500 characters
+
+async def use_proxy():
+    print("\n--- Using a Proxy ---")
+    print(
+        "Note: Replace 'http://your-proxy-url:port' with a working proxy to run this example."
+ ) + # Uncomment and modify the following lines to use a proxy + async with AsyncWebCrawler(verbose=True, proxy="http://your-proxy-url:port") as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", + cache_mode= CacheMode.BYPASS + ) + if result.success: + print(result.markdown[:500]) # Print first 500 characters + +async def capture_and_save_screenshot(url: str, output_path: str): + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun( + url=url, + screenshot=True, + cache_mode= CacheMode.BYPASS + ) + + if result.success and result.screenshot: + import base64 + + # Decode the base64 screenshot data + screenshot_data = base64.b64decode(result.screenshot) + + # Save the screenshot as a JPEG file + with open(output_path, 'wb') as f: + f.write(screenshot_data) + + print(f"Screenshot saved successfully to {output_path}") + else: + print("Failed to capture screenshot") + +class OpenAIModelFee(BaseModel): + model_name: str = Field(..., description="Name of the OpenAI model.") + input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") + output_fee: str = Field( + ..., description="Fee for output token for the OpenAI model." + ) + +async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: Dict[str, str] = None): + print(f"\n--- Extracting Structured Data with {provider} ---") + + if api_token is None and provider != "ollama": + print(f"API token is required for {provider}. Skipping this example.") + return + + # extra_args = {} + extra_args={ + "temperature": 0, + "top_p": 0.9, + "max_tokens": 2000, + # any other supported parameters for litellm + } + if extra_headers: + extra_args["extra_headers"] = extra_headers + + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun( + url="https://openai.com/api/pricing/", + word_count_threshold=1, + extraction_strategy=LLMExtractionStrategy( + provider=provider, + api_token=api_token, + schema=OpenAIModelFee.model_json_schema(), + extraction_type="schema", + instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. + Do not miss any models in the entire content. 
One extracted model JSON format should look like this: + {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""", + extra_args=extra_args + ), + cache_mode=CacheMode.BYPASS, + ) + print(result.extracted_content) + +async def extract_structured_data_using_css_extractor(): + print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---") + schema = { + "name": "KidoCode Courses", + "baseSelector": "section.charge-methodology .w-tab-content > div", + "fields": [ + { + "name": "section_title", + "selector": "h3.heading-50", + "type": "text", + }, + { + "name": "section_description", + "selector": ".charge-content", + "type": "text", + }, + { + "name": "course_name", + "selector": ".text-block-93", + "type": "text", + }, + { + "name": "course_description", + "selector": ".course-content-text", + "type": "text", + }, + { + "name": "course_icon", + "selector": ".image-92", + "type": "attribute", + "attribute": "src" + } + ] +} + + async with AsyncWebCrawler( + headless=True, + verbose=True + ) as crawler: + + # Create the JavaScript that handles clicking multiple times + js_click_tabs = """ + (async () => { + const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div"); + + for(let tab of tabs) { + // scroll to the tab + tab.scrollIntoView(); + tab.click(); + // Wait for content to load and animations to complete + await new Promise(r => setTimeout(r, 500)); + } + })(); + """ + + result = await crawler.arun( + url="https://www.kidocode.com/degrees/technology", + extraction_strategy=JsonCssExtractionStrategy(schema, verbose=True), + js_code=[js_click_tabs], + cache_mode=CacheMode.BYPASS + ) + + companies = json.loads(result.extracted_content) + print(f"Successfully extracted {len(companies)} companies") + print(json.dumps(companies[0], indent=2)) + +# Advanced Session-Based Crawling with Dynamic Content 🔄 +async def crawl_dynamic_content_pages_method_1(): + print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---") + first_commit = "" + + async def on_execution_started(page): + nonlocal first_commit + try: + while True: + await page.wait_for_selector("li.Box-sc-g0xbh4-0 h4") + commit = await page.query_selector("li.Box-sc-g0xbh4-0 h4") + commit = await commit.evaluate("(element) => element.textContent") + commit = re.sub(r"\s+", "", commit) + if commit and commit != first_commit: + first_commit = commit + break + await asyncio.sleep(0.5) + except Exception as e: + print(f"Warning: New content didn't appear after JavaScript execution: {e}") + + async with AsyncWebCrawler(verbose=True) as crawler: + crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started) + + url = "https://github.com/microsoft/TypeScript/commits/main" + session_id = "typescript_commits_session" + all_commits = [] + + js_next_page = """ + (() => { + const button = document.querySelector('a[data-testid="pagination-next-button"]'); + if (button) button.click(); + })(); + """ + + for page in range(3): # Crawl 3 pages + result = await crawler.arun( + url=url, + session_id=session_id, + css_selector="li.Box-sc-g0xbh4-0", + js=js_next_page if page > 0 else None, + cache_mode=CacheMode.BYPASS, + js_only=page > 0, + headless=False, + ) + + assert result.success, f"Failed to crawl page {page + 1}" + + soup = BeautifulSoup(result.cleaned_html, "html.parser") + commits = soup.select("li") + all_commits.extend(commits) + + print(f"Page {page + 1}: Found {len(commits)} commits") + + await 
crawler.crawler_strategy.kill_session(session_id) + print(f"Successfully crawled {len(all_commits)} commits across 3 pages") + +async def crawl_dynamic_content_pages_method_2(): + print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---") + + async with AsyncWebCrawler(verbose=True) as crawler: + url = "https://github.com/microsoft/TypeScript/commits/main" + session_id = "typescript_commits_session" + all_commits = [] + last_commit = "" + + js_next_page_and_wait = """ + (async () => { + const getCurrentCommit = () => { + const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4'); + return commits.length > 0 ? commits[0].textContent.trim() : null; + }; + + const initialCommit = getCurrentCommit(); + const button = document.querySelector('a[data-testid="pagination-next-button"]'); + if (button) button.click(); + + // Poll for changes + while (true) { + await new Promise(resolve => setTimeout(resolve, 100)); // Wait 100ms + const newCommit = getCurrentCommit(); + if (newCommit && newCommit !== initialCommit) { + break; + } + } + })(); + """ + + schema = { + "name": "Commit Extractor", + "baseSelector": "li.Box-sc-g0xbh4-0", + "fields": [ + { + "name": "title", + "selector": "h4.markdown-title", + "type": "text", + "transform": "strip", + }, + ], + } + extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True) + + for page in range(3): # Crawl 3 pages + result = await crawler.arun( + url=url, + session_id=session_id, + css_selector="li.Box-sc-g0xbh4-0", + extraction_strategy=extraction_strategy, + js_code=js_next_page_and_wait if page > 0 else None, + js_only=page > 0, + cache_mode=CacheMode.BYPASS, + headless=False, + ) + + assert result.success, f"Failed to crawl page {page + 1}" + + commits = json.loads(result.extracted_content) + all_commits.extend(commits) + + print(f"Page {page + 1}: Found {len(commits)} commits") + + await crawler.crawler_strategy.kill_session(session_id) + print(f"Successfully crawled {len(all_commits)} commits across 3 pages") + +async def crawl_dynamic_content_pages_method_3(): + print("\n--- Advanced Multi-Page Crawling with JavaScript Execution using `wait_for` ---") + + async with AsyncWebCrawler(verbose=True) as crawler: + url = "https://github.com/microsoft/TypeScript/commits/main" + session_id = "typescript_commits_session" + all_commits = [] + + js_next_page = """ + const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4'); + if (commits.length > 0) { + window.firstCommit = commits[0].textContent.trim(); + } + const button = document.querySelector('a[data-testid="pagination-next-button"]'); + if (button) button.click(); + """ + + wait_for = """() => { + const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4'); + if (commits.length === 0) return false; + const firstCommit = commits[0].textContent.trim(); + return firstCommit !== window.firstCommit; + }""" + + schema = { + "name": "Commit Extractor", + "baseSelector": "li.Box-sc-g0xbh4-0", + "fields": [ + { + "name": "title", + "selector": "h4.markdown-title", + "type": "text", + "transform": "strip", + }, + ], + } + extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True) + + for page in range(3): # Crawl 3 pages + result = await crawler.arun( + url=url, + session_id=session_id, + css_selector="li.Box-sc-g0xbh4-0", + extraction_strategy=extraction_strategy, + js_code=js_next_page if page > 0 else None, + wait_for=wait_for if page > 0 else None, + js_only=page > 0, + cache_mode=CacheMode.BYPASS, + headless=False, + ) + + assert result.success, 
f"Failed to crawl page {page + 1}" + + commits = json.loads(result.extracted_content) + all_commits.extend(commits) + + print(f"Page {page + 1}: Found {len(commits)} commits") + + await crawler.crawler_strategy.kill_session(session_id) + print(f"Successfully crawled {len(all_commits)} commits across 3 pages") + +async def crawl_custom_browser_type(): + # Use Firefox + start = time.time() + async with AsyncWebCrawler(browser_type="firefox", verbose=True, headless = True) as crawler: + result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS) + print(result.markdown[:500]) + print("Time taken: ", time.time() - start) + + # Use WebKit + start = time.time() + async with AsyncWebCrawler(browser_type="webkit", verbose=True, headless = True) as crawler: + result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS) + print(result.markdown[:500]) + print("Time taken: ", time.time() - start) + + # Use Chromium (default) + start = time.time() + async with AsyncWebCrawler(verbose=True, headless = True) as crawler: + result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS) + print(result.markdown[:500]) + print("Time taken: ", time.time() - start) + +async def crawl_with_user_simultion(): + async with AsyncWebCrawler(verbose=True, headless=True) as crawler: + url = "YOUR-URL-HERE" + result = await crawler.arun( + url=url, + cache_mode=CacheMode.BYPASS, + magic = True, # Automatically detects and removes overlays, popups, and other elements that block content + # simulate_user = True,# Causes a series of random mouse movements and clicks to simulate user interaction + # override_navigator = True # Overrides the navigator object to make it look like a real user + ) + + print(result.markdown) + +async def speed_comparison(): + # print("\n--- Speed Comparison ---") + # print("Firecrawl (simulated):") + # print("Time taken: 7.02 seconds") + # print("Content length: 42074 characters") + # print("Images found: 49") + # print() + # Simulated Firecrawl performance + from firecrawl import FirecrawlApp + app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY']) + start = time.time() + scrape_status = app.scrape_url( + 'https://www.nbcnews.com/business', + params={'formats': ['markdown', 'html']} + ) + end = time.time() + print("Firecrawl:") + print(f"Time taken: {end - start:.2f} seconds") + print(f"Content length: {len(scrape_status['markdown'])} characters") + print(f"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}") + print() + + async with AsyncWebCrawler() as crawler: + # Crawl4AI simple crawl + start = time.time() + result = await crawler.arun( + url="https://www.nbcnews.com/business", + word_count_threshold=0, + cache_mode=CacheMode.BYPASS, + verbose=False, + ) + end = time.time() + print("Crawl4AI (simple crawl):") + print(f"Time taken: {end - start:.2f} seconds") + print(f"Content length: {len(result.markdown)} characters") + print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}") + print() + + # Crawl4AI with advanced content filtering + start = time.time() + result = await crawler.arun( + url="https://www.nbcnews.com/business", + word_count_threshold=0, + markdown_generator=DefaultMarkdownGenerator( + content_filter = PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0) + # content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0) + ), + cache_mode=CacheMode.BYPASS, + verbose=False, + ) + end = time.time() + print("Crawl4AI 
(Markdown Plus):") + print(f"Time taken: {end - start:.2f} seconds") + print(f"Content length: {len(result.markdown_v2.raw_markdown)} characters") + print(f"Fit Markdown: {len(result.markdown_v2.fit_markdown)} characters") + print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}") + print() + + # Crawl4AI with JavaScript execution + start = time.time() + result = await crawler.arun( + url="https://www.nbcnews.com/business", + js_code=[ + "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();" + ], + word_count_threshold=0, + cache_mode=CacheMode.BYPASS, + markdown_generator=DefaultMarkdownGenerator( + content_filter = PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0) + # content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0) + ), + verbose=False, + ) + end = time.time() + print("Crawl4AI (with JavaScript execution):") + print(f"Time taken: {end - start:.2f} seconds") + print(f"Content length: {len(result.markdown)} characters") + print(f"Fit Markdown: {len(result.markdown_v2.fit_markdown)} characters") + print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}") + + print("\nNote on Speed Comparison:") + print("The speed test conducted here may not reflect optimal conditions.") + print("When we call Firecrawl's API, we're seeing its best performance,") + print("while Crawl4AI's performance is limited by the local network speed.") + print("For a more accurate comparison, it's recommended to run these tests") + print("on servers with a stable and fast internet connection.") + print("Despite these limitations, Crawl4AI still demonstrates faster performance.") + print("If you run these tests in an environment with better network conditions,") + print("you may observe an even more significant speed advantage for Crawl4AI.") + +async def generate_knowledge_graph(): + class Entity(BaseModel): + name: str + description: str + + class Relationship(BaseModel): + entity1: Entity + entity2: Entity + description: str + relation_type: str + + class KnowledgeGraph(BaseModel): + entities: List[Entity] + relationships: List[Relationship] + + extraction_strategy = LLMExtractionStrategy( + provider='openai/gpt-4o-mini', # Or any other provider, including Ollama and open source models + api_token=os.getenv('OPENAI_API_KEY'), # In case of Ollama just pass "no-token" + schema=KnowledgeGraph.model_json_schema(), + extraction_type="schema", + instruction="""Extract entities and relationships from the given text.""" + ) + async with AsyncWebCrawler() as crawler: + url = "https://paulgraham.com/love.html" + result = await crawler.arun( + url=url, + cache_mode=CacheMode.BYPASS, + extraction_strategy=extraction_strategy, + # magic=True + ) + # print(result.extracted_content) + with open(os.path.join(__location__, "kb.json"), "w") as f: + f.write(result.extracted_content) + +async def fit_markdown_remove_overlay(): + + async with AsyncWebCrawler( + headless=True, # Set to False to see what is happening + verbose=True, + user_agent_mode="random", + user_agent_generator_config={ + "device_type": "mobile", + "os_type": "android" + }, + ) as crawler: + result = await crawler.arun( + url='https://www.kidocode.com/degrees/technology', + cache_mode=CacheMode.BYPASS, + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter( + threshold=0.48, threshold_type="fixed", min_word_threshold=0 + ), + options={ + 
"ignore_links": True + } + ), + # markdown_generator=DefaultMarkdownGenerator( + # content_filter=BM25ContentFilter(user_query="", bm25_threshold=1.0), + # options={ + # "ignore_links": True + # } + # ), + ) + + if result.success: + print(len(result.markdown_v2.raw_markdown)) + print(len(result.markdown_v2.markdown_with_citations)) + print(len(result.markdown_v2.fit_markdown)) + + # Save clean html + with open(os.path.join(__location__, "output/cleaned_html.html"), "w") as f: + f.write(result.cleaned_html) + + with open(os.path.join(__location__, "output/output_raw_markdown.md"), "w") as f: + f.write(result.markdown_v2.raw_markdown) + + with open(os.path.join(__location__, "output/output_markdown_with_citations.md"), "w") as f: + f.write(result.markdown_v2.markdown_with_citations) + + with open(os.path.join(__location__, "output/output_fit_markdown.md"), "w") as f: + f.write(result.markdown_v2.fit_markdown) + + print("Done") + + +async def main(): + # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) + + # await simple_crawl() + # await simple_example_with_running_js_code() + # await simple_example_with_css_selector() + # # await use_proxy() + # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) + # await extract_structured_data_using_css_extractor() + + # LLM extraction examples + # await extract_structured_data_using_llm() + # await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY")) + # await extract_structured_data_using_llm("ollama/llama3.2") + + # You always can pass custom headers to the extraction strategy + # custom_headers = { + # "Authorization": "Bearer your-custom-token", + # "X-Custom-Header": "Some-Value" + # } + # await extract_structured_data_using_llm(extra_headers=custom_headers) + + # await crawl_dynamic_content_pages_method_1() + # await crawl_dynamic_content_pages_method_2() + await crawl_dynamic_content_pages_method_3() + + # await crawl_custom_browser_type() + + # await speed_comparison() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/docs/examples/quickstart_sync.py b/docs/examples/quickstart_sync.py new file mode 100644 index 0000000000000000000000000000000000000000..89c631397595851b4e923e3e15e962804c83af42 --- /dev/null +++ b/docs/examples/quickstart_sync.py @@ -0,0 +1,312 @@ +import os +import time +from crawl4ai.web_crawler import WebCrawler +from crawl4ai.chunking_strategy import * +from crawl4ai.extraction_strategy import * +from crawl4ai.crawler_strategy import * +from rich import print +from rich.console import Console +from functools import lru_cache + +console = Console() + +@lru_cache() +def create_crawler(): + crawler = WebCrawler(verbose=True) + crawler.warmup() + return crawler + +def print_result(result): + # Print each key in one line and just the first 10 characters of each one's value and three dots + console.print(f"\t[bold]Result:[/bold]") + for key, value in result.model_dump().items(): + if isinstance(value, str) and value: + console.print(f"\t{key}: [green]{value[:20]}...[/green]") + if result.extracted_content: + items = json.loads(result.extracted_content) + print(f"\t[bold]{len(items)} blocks is extracted![/bold]") + + +def cprint(message, press_any_key=False): + console.print(message) + if press_any_key: + console.print("Press any key to continue...", style="") + input() + +def basic_usage(crawler): + cprint("🛠️ [bold cyan]Basic Usage: Simply 
provide a URL and let Crawl4ai do the magic![/bold cyan]") + result = crawler.run(url="https://www.nbcnews.com/business", only_text = True) + cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]") + print_result(result) + +def basic_usage_some_params(crawler): + cprint("🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]") + result = crawler.run(url="https://www.nbcnews.com/business", word_count_threshold=1, only_text = True) + cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]") + print_result(result) + +def screenshot_usage(crawler): + cprint("\n📸 [bold cyan]Let's take a screenshot of the page![/bold cyan]") + result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True) + cprint("[LOG] 📦 [bold yellow]Screenshot result:[/bold yellow]") + # Save the screenshot to a file + with open("screenshot.png", "wb") as f: + f.write(base64.b64decode(result.screenshot)) + cprint("Screenshot saved to 'screenshot.png'!") + print_result(result) + +def understanding_parameters(crawler): + cprint("\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]") + cprint("By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action.") + + # First crawl (reads from cache) + cprint("1️⃣ First crawl (caches the result):", True) + start_time = time.time() + result = crawler.run(url="https://www.nbcnews.com/business") + end_time = time.time() + cprint(f"[LOG] 📦 [bold yellow]First crawl took {end_time - start_time} seconds and result (from cache):[/bold yellow]") + print_result(result) + + # Force to crawl again + cprint("2️⃣ Second crawl (Force to crawl again):", True) + start_time = time.time() + result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True) + end_time = time.time() + cprint(f"[LOG] 📦 [bold yellow]Second crawl took {end_time - start_time} seconds and result (forced to crawl):[/bold yellow]") + print_result(result) + +def add_chunking_strategy(crawler): + # Adding a chunking strategy: RegexChunking + cprint("\n🧩 [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]", True) + cprint("RegexChunking is a simple chunking strategy that splits the text based on a given regex pattern. Let's see it in action!") + result = crawler.run( + url="https://www.nbcnews.com/business", + chunking_strategy=RegexChunking(patterns=["\n\n"]) + ) + cprint("[LOG] 📦 [bold yellow]RegexChunking result:[/bold yellow]") + print_result(result) + + # Adding another chunking strategy: NlpSentenceChunking + cprint("\n🔍 [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]", True) + cprint("NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!") + result = crawler.run( + url="https://www.nbcnews.com/business", + chunking_strategy=NlpSentenceChunking() + ) + cprint("[LOG] 📦 [bold yellow]NlpSentenceChunking result:[/bold yellow]") + print_result(result) + +def add_extraction_strategy(crawler): + # Adding an extraction strategy: CosineStrategy + cprint("\n🧠 [bold cyan]Let's get smarter with an extraction strategy: CosineStrategy![/bold cyan]", True) + cprint("CosineStrategy uses cosine similarity to extract semantically similar blocks of text. 
Let's see it in action!") + result = crawler.run( + url="https://www.nbcnews.com/business", + extraction_strategy=CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold = 0.3, verbose=True) + ) + cprint("[LOG] 📦 [bold yellow]CosineStrategy result:[/bold yellow]") + print_result(result) + + # Using semantic_filter with CosineStrategy + cprint("You can pass other parameters like 'semantic_filter' to the CosineStrategy to extract semantically similar blocks of text. Let's see it in action!") + result = crawler.run( + url="https://www.nbcnews.com/business", + extraction_strategy=CosineStrategy( + semantic_filter="inflation rent prices", + ) + ) + cprint("[LOG] 📦 [bold yellow]CosineStrategy result with semantic filter:[/bold yellow]") + print_result(result) + +def add_llm_extraction_strategy(crawler): + # Adding an LLM extraction strategy without instructions + cprint("\n🤖 [bold cyan]Time to bring in the big guns: LLMExtractionStrategy without instructions![/bold cyan]", True) + cprint("LLMExtractionStrategy uses a large language model to extract relevant information from the web page. Let's see it in action!") + result = crawler.run( + url="https://www.nbcnews.com/business", + extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY')) + ) + cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (no instructions) result:[/bold yellow]") + print_result(result) + + # Adding an LLM extraction strategy with instructions + cprint("\n📜 [bold cyan]Let's make it even more interesting: LLMExtractionStrategy with instructions![/bold cyan]", True) + cprint("Let's say we are only interested in financial news. Let's see how LLMExtractionStrategy performs with instructions!") + result = crawler.run( + url="https://www.nbcnews.com/business", + extraction_strategy=LLMExtractionStrategy( + provider="openai/gpt-4o", + api_token=os.getenv('OPENAI_API_KEY'), + instruction="I am interested in only financial news" + ) + ) + cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (with instructions) result:[/bold yellow]") + print_result(result) + + result = crawler.run( + url="https://www.nbcnews.com/business", + extraction_strategy=LLMExtractionStrategy( + provider="openai/gpt-4o", + api_token=os.getenv('OPENAI_API_KEY'), + instruction="Extract only content related to technology" + ) + ) + cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (with technology instruction) result:[/bold yellow]") + print_result(result) + +def targeted_extraction(crawler): + # Using a CSS selector to extract only H2 tags + cprint("\n🎯 [bold cyan]Targeted extraction: Let's use a CSS selector to extract only H2 tags![/bold cyan]", True) + result = crawler.run( + url="https://www.nbcnews.com/business", + css_selector="h2" + ) + cprint("[LOG] 📦 [bold yellow]CSS Selector (H2 tags) result:[/bold yellow]") + print_result(result) + +def interactive_extraction(crawler): + # Passing JavaScript code to interact with the page + cprint("\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]", True) + cprint("In this example we try to click the 'Load More' button on the page using JavaScript code.") + js_code = """ + const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); + loadMoreButton && loadMoreButton.click(); + """ + # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code) + # crawler = WebCrawler(crawler_strategy=crawler_strategy, 
always_by_pass_cache=True) + result = crawler.run( + url="https://www.nbcnews.com/business", + js = js_code + ) + cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]") + print_result(result) + +def multiple_scrip(crawler): + # Passing JavaScript code to interact with the page + cprint("\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]", True) + cprint("In this example we try to click the 'Load More' button on the page using JavaScript code.") + js_code = [""" + const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); + loadMoreButton && loadMoreButton.click(); + """] * 2 + # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code) + # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True) + result = crawler.run( + url="https://www.nbcnews.com/business", + js = js_code + ) + cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]") + print_result(result) + +def using_crawler_hooks(crawler): + # Example usage of the hooks for authentication and setting a cookie + def on_driver_created(driver): + print("[HOOK] on_driver_created") + # Example customization: maximize the window + driver.maximize_window() + + # Example customization: logging in to a hypothetical website + driver.get('https://example.com/login') + + from selenium.webdriver.support.ui import WebDriverWait + from selenium.webdriver.common.by import By + from selenium.webdriver.support import expected_conditions as EC + + WebDriverWait(driver, 10).until( + EC.presence_of_element_located((By.NAME, 'username')) + ) + driver.find_element(By.NAME, 'username').send_keys('testuser') + driver.find_element(By.NAME, 'password').send_keys('password123') + driver.find_element(By.NAME, 'login').click() + WebDriverWait(driver, 10).until( + EC.presence_of_element_located((By.ID, 'welcome')) + ) + # Add a custom cookie + driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'}) + return driver + + + def before_get_url(driver): + print("[HOOK] before_get_url") + # Example customization: add a custom header + # Enable Network domain for sending headers + driver.execute_cdp_cmd('Network.enable', {}) + # Add a custom header + driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': {'X-Test-Header': 'test'}}) + return driver + + def after_get_url(driver): + print("[HOOK] after_get_url") + # Example customization: log the URL + print(driver.current_url) + return driver + + def before_return_html(driver, html): + print("[HOOK] before_return_html") + # Example customization: log the HTML + print(len(html)) + return driver + + cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True) + + crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True) + crawler_strategy.set_hook('on_driver_created', on_driver_created) + crawler_strategy.set_hook('before_get_url', before_get_url) + crawler_strategy.set_hook('after_get_url', after_get_url) + crawler_strategy.set_hook('before_return_html', before_return_html) + + crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy) + crawler.warmup() + result = crawler.run(url="https://example.com") + + cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]") + print_result(result= result) + +def using_crawler_hooks_dleay_example(crawler): + def delay(driver): + print("Delaying for 5 seconds...") + 
time.sleep(5) + print("Resuming...") + + def create_crawler(): + crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True) + crawler_strategy.set_hook('after_get_url', delay) + crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy) + crawler.warmup() + return crawler + + cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's add a delay after fetching the url to make sure entire page is fetched.[/bold cyan]") + crawler = create_crawler() + result = crawler.run(url="https://google.com", bypass_cache=True) + + cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]") + print_result(result) + + + +def main(): + cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]") + cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]") + cprint("If this is the first time you're running Crawl4ai, this might take a few seconds to load required model files.") + + crawler = create_crawler() + + crawler.always_by_pass_cache = True + basic_usage(crawler) + # basic_usage_some_params(crawler) + understanding_parameters(crawler) + + crawler.always_by_pass_cache = True + screenshot_usage(crawler) + add_chunking_strategy(crawler) + add_extraction_strategy(crawler) + add_llm_extraction_strategy(crawler) + targeted_extraction(crawler) + interactive_extraction(crawler) + multiple_scrip(crawler) + + cprint("\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]") + +if __name__ == "__main__": + main() + diff --git a/docs/examples/quickstart_v0.ipynb b/docs/examples/quickstart_v0.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..71f23acb7d921a8bafe3f3126088a8d043de87e9 --- /dev/null +++ b/docs/examples/quickstart_v0.ipynb @@ -0,0 +1,735 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "6yLvrXn7yZQI" + }, + "source": [ + "# Crawl4AI: Advanced Web Crawling and Data Extraction\n", + "\n", + "Welcome to this interactive notebook showcasing Crawl4AI, an advanced asynchronous web crawling and data extraction library.\n", + "\n", + "- GitHub Repository: [https://github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)\n", + "- Twitter: [@unclecode](https://twitter.com/unclecode)\n", + "- Website: [https://crawl4ai.com](https://crawl4ai.com)\n", + "\n", + "Let's explore the powerful features of Crawl4AI!" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KIn_9nxFyZQK" + }, + "source": [ + "## Installation\n", + "\n", + "First, let's install Crawl4AI from GitHub:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "mSnaxLf3zMog" + }, + "outputs": [], + "source": [ + "!sudo apt-get update && sudo apt-get install -y libwoff1 libopus0 libwebp6 libwebpdemux2 libenchant1c2a libgudev-1.0-0 libsecret-1-0 libhyphen0 libgdk-pixbuf2.0-0 libegl1 libnotify4 libxslt1.1 libevent-2.1-7 libgles2 libvpx6 libxcomposite1 libatk1.0-0 libatk-bridge2.0-0 libepoxy0 libgtk-3-0 libharfbuzz-icu0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xlXqaRtayZQK" + }, + "outputs": [], + "source": [ + "!pip install crawl4ai\n", + "!pip install nest-asyncio\n", + "!playwright install" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qKCE7TI7yZQL" + }, + "source": [ + "Now, let's import the necessary libraries:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "I67tr7aAyZQL" + }, + "outputs": [], + "source": [ + "import asyncio\n", + "import nest_asyncio\n", + "from crawl4ai import AsyncWebCrawler\n", + "from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy\n", + "import json\n", + "import time\n", + "from pydantic import BaseModel, Field\n", + "\n", + "nest_asyncio.apply()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "h7yR_Rt_yZQM" + }, + "source": [ + "## Basic Usage\n", + "\n", + "Let's start with a simple crawl example:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "yBh6hf4WyZQM", + "outputId": "0f83af5c-abba-4175-ed95-70b7512e6bcc" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[LOG] 🌤️ Warming up the AsyncWebCrawler\n", + "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n", + "[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.05 seconds\n", + "[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.05 seconds.\n", + "18102\n" + ] + } + ], + "source": [ + "async def simple_crawl():\n", + " async with AsyncWebCrawler(verbose=True) as crawler:\n", + " result = await crawler.arun(url=\"https://www.nbcnews.com/business\")\n", + " print(len(result.markdown))\n", + "await simple_crawl()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9rtkgHI28uI4" + }, + "source": [ + "💡 By default, **Crawl4AI** caches the result of every URL, so the next time you call it, you’ll get an instant result. But if you want to bypass the cache, just set `bypass_cache=True`." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MzZ0zlJ9yZQM" + }, + "source": [ + "## Advanced Features\n", + "\n", + "### Executing JavaScript and Using CSS Selectors" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "gHStF86xyZQM", + "outputId": "34d0fb6d-4dec-4677-f76e-85a1f082829b" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[LOG] 🌤️ Warming up the AsyncWebCrawler\n", + "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n", + "[LOG] 🕸️ Crawling https://www.nbcnews.com/business using AsyncPlaywrightCrawlerStrategy...\n", + "[LOG] ✅ Crawled https://www.nbcnews.com/business successfully!\n", + "[LOG] 🚀 Crawling done for https://www.nbcnews.com/business, success: True, time taken: 6.06 seconds\n", + "[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.10 seconds\n", + "[LOG] 🔥 Extracting semantic blocks for https://www.nbcnews.com/business, Strategy: AsyncWebCrawler\n", + "[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.11 seconds.\n", + "41135\n" + ] + } + ], + "source": [ + "async def js_and_css():\n", + " async with AsyncWebCrawler(verbose=True) as crawler:\n", + " js_code = [\"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();\"]\n", + " result = await crawler.arun(\n", + " url=\"https://www.nbcnews.com/business\",\n", + " js_code=js_code,\n", + " # css_selector=\"YOUR_CSS_SELECTOR_HERE\",\n", + " bypass_cache=True\n", + " )\n", + " print(len(result.markdown))\n", + "\n", + "await js_and_css()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cqE_W4coyZQM" + }, + "source": [ + "### Using a Proxy\n", + "\n", + "Note: You'll need to replace the proxy URL with a working proxy for this example to run successfully." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QjAyiAGqyZQM" + }, + "outputs": [], + "source": [ + "async def use_proxy():\n", + " async with AsyncWebCrawler(verbose=True, proxy=\"http://your-proxy-url:port\") as crawler:\n", + " result = await crawler.arun(\n", + " url=\"https://www.nbcnews.com/business\",\n", + " bypass_cache=True\n", + " )\n", + " print(result.markdown[:500]) # Print first 500 characters\n", + "\n", + "# Uncomment the following line to run the proxy example\n", + "# await use_proxy()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XTZ88lbayZQN" + }, + "source": [ + "### Extracting Structured Data with OpenAI\n", + "\n", + "Note: You'll need to set your OpenAI API key as an environment variable for this example to work." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "fIOlDayYyZQN", + "outputId": "cb8359cc-dee0-4762-9698-5dfdcee055b8" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[LOG] 🌤️ Warming up the AsyncWebCrawler\n", + "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n", + "[LOG] 🕸️ Crawling https://openai.com/api/pricing/ using AsyncPlaywrightCrawlerStrategy...\n", + "[LOG] ✅ Crawled https://openai.com/api/pricing/ successfully!\n", + "[LOG] 🚀 Crawling done for https://openai.com/api/pricing/, success: True, time taken: 3.77 seconds\n", + "[LOG] 🚀 Content extracted for https://openai.com/api/pricing/, success: True, time taken: 0.21 seconds\n", + "[LOG] 🔥 Extracting semantic blocks for https://openai.com/api/pricing/, Strategy: AsyncWebCrawler\n", + "[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 0\n", + "[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 1\n", + "[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 2\n", + "[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 3\n", + "[LOG] Extracted 4 blocks from URL: https://openai.com/api/pricing/ block index: 3\n", + "[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 4\n", + "[LOG] Extracted 5 blocks from URL: https://openai.com/api/pricing/ block index: 0\n", + "[LOG] Extracted 1 blocks from URL: https://openai.com/api/pricing/ block index: 4\n", + "[LOG] Extracted 8 blocks from URL: https://openai.com/api/pricing/ block index: 1\n", + "[LOG] Extracted 12 blocks from URL: https://openai.com/api/pricing/ block index: 2\n", + "[LOG] 🚀 Extraction done for https://openai.com/api/pricing/, time taken: 8.55 seconds.\n", + "5029\n" + ] + } + ], + "source": [ + "import os\n", + "from google.colab import userdata\n", + "os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')\n", + "\n", + "class OpenAIModelFee(BaseModel):\n", + " model_name: str = Field(..., description=\"Name of the OpenAI model.\")\n", + " input_fee: str = Field(..., description=\"Fee for input token for the OpenAI model.\")\n", + " output_fee: str = Field(..., description=\"Fee for output token for the OpenAI model.\")\n", + "\n", + "async def extract_openai_fees():\n", + " async with AsyncWebCrawler(verbose=True) as crawler:\n", + " result = await crawler.arun(\n", + " url='https://openai.com/api/pricing/',\n", + " word_count_threshold=1,\n", + " extraction_strategy=LLMExtractionStrategy(\n", + " provider=\"openai/gpt-4o\", api_token=os.getenv('OPENAI_API_KEY'),\n", + " schema=OpenAIModelFee.schema(),\n", + " extraction_type=\"schema\",\n", + " instruction=\"\"\"From the crawled content, extract all mentioned model names along with their fees for input and output tokens.\n", + " Do not miss any models in the entire content. 
One extracted model JSON format should look like this:\n", + " {\"model_name\": \"GPT-4\", \"input_fee\": \"US$10.00 / 1M tokens\", \"output_fee\": \"US$30.00 / 1M tokens\"}.\"\"\"\n", + " ),\n", + " bypass_cache=True,\n", + " )\n", + " print(len(result.extracted_content))\n", + "\n", + "# Uncomment the following line to run the OpenAI extraction example\n", + "await extract_openai_fees()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BypA5YxEyZQN" + }, + "source": [ + "### Advanced Multi-Page Crawling with JavaScript Execution" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tfkcVQ0b7mw-" + }, + "source": [ + "## Advanced Multi-Page Crawling with JavaScript Execution\n", + "\n", + "This example demonstrates Crawl4AI's ability to handle complex crawling scenarios, specifically extracting commits from multiple pages of a GitHub repository. The challenge here is that clicking the \"Next\" button doesn't load a new page, but instead uses asynchronous JavaScript to update the content. This is a common hurdle in modern web crawling.\n", + "\n", + "To overcome this, we use Crawl4AI's custom JavaScript execution to simulate clicking the \"Next\" button, and implement a custom hook to detect when new data has loaded. Our strategy involves comparing the first commit's text before and after \"clicking\" Next, waiting until it changes to confirm new data has rendered. This showcases Crawl4AI's flexibility in handling dynamic content and its ability to implement custom logic for even the most challenging crawling tasks." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "qUBKGpn3yZQN", + "outputId": "3e555b6a-ed33-42f4-cce9-499a923fbe17" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[LOG] 🌤️ Warming up the AsyncWebCrawler\n", + "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n", + "[LOG] 🕸️ Crawling https://github.com/microsoft/TypeScript/commits/main using AsyncPlaywrightCrawlerStrategy...\n", + "[LOG] ✅ Crawled https://github.com/microsoft/TypeScript/commits/main successfully!\n", + "[LOG] 🚀 Crawling done for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 5.16 seconds\n", + "[LOG] 🚀 Content extracted for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 0.28 seconds\n", + "[LOG] 🔥 Extracting semantic blocks for https://github.com/microsoft/TypeScript/commits/main, Strategy: AsyncWebCrawler\n", + "[LOG] 🚀 Extraction done for https://github.com/microsoft/TypeScript/commits/main, time taken: 0.28 seconds.\n", + "Page 1: Found 35 commits\n", + "[LOG] 🕸️ Crawling https://github.com/microsoft/TypeScript/commits/main using AsyncPlaywrightCrawlerStrategy...\n", + "[LOG] ✅ Crawled https://github.com/microsoft/TypeScript/commits/main successfully!\n", + "[LOG] 🚀 Crawling done for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 0.78 seconds\n", + "[LOG] 🚀 Content extracted for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 0.90 seconds\n", + "[LOG] 🔥 Extracting semantic blocks for https://github.com/microsoft/TypeScript/commits/main, Strategy: AsyncWebCrawler\n", + "[LOG] 🚀 Extraction done for https://github.com/microsoft/TypeScript/commits/main, time taken: 0.90 seconds.\n", + "Page 2: Found 35 commits\n", + "[LOG] 🕸️ Crawling https://github.com/microsoft/TypeScript/commits/main using 
AsyncPlaywrightCrawlerStrategy...\n", + "[LOG] ✅ Crawled https://github.com/microsoft/TypeScript/commits/main successfully!\n", + "[LOG] 🚀 Crawling done for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 2.00 seconds\n", + "[LOG] 🚀 Content extracted for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 0.74 seconds\n", + "[LOG] 🔥 Extracting semantic blocks for https://github.com/microsoft/TypeScript/commits/main, Strategy: AsyncWebCrawler\n", + "[LOG] 🚀 Extraction done for https://github.com/microsoft/TypeScript/commits/main, time taken: 0.75 seconds.\n", + "Page 3: Found 35 commits\n", + "Successfully crawled 105 commits across 3 pages\n" + ] + } + ], + "source": [ + "import re\n", + "from bs4 import BeautifulSoup\n", + "\n", + "async def crawl_typescript_commits():\n", + " first_commit = \"\"\n", + " async def on_execution_started(page):\n", + " nonlocal first_commit\n", + " try:\n", + " while True:\n", + " await page.wait_for_selector('li.Box-sc-g0xbh4-0 h4')\n", + " commit = await page.query_selector('li.Box-sc-g0xbh4-0 h4')\n", + " commit = await commit.evaluate('(element) => element.textContent')\n", + " commit = re.sub(r'\\s+', '', commit)\n", + " if commit and commit != first_commit:\n", + " first_commit = commit\n", + " break\n", + " await asyncio.sleep(0.5)\n", + " except Exception as e:\n", + " print(f\"Warning: New content didn't appear after JavaScript execution: {e}\")\n", + "\n", + " async with AsyncWebCrawler(verbose=True) as crawler:\n", + " crawler.crawler_strategy.set_hook('on_execution_started', on_execution_started)\n", + "\n", + " url = \"https://github.com/microsoft/TypeScript/commits/main\"\n", + " session_id = \"typescript_commits_session\"\n", + " all_commits = []\n", + "\n", + " js_next_page = \"\"\"\n", + " const button = document.querySelector('a[data-testid=\"pagination-next-button\"]');\n", + " if (button) button.click();\n", + " \"\"\"\n", + "\n", + " for page in range(3): # Crawl 3 pages\n", + " result = await crawler.arun(\n", + " url=url,\n", + " session_id=session_id,\n", + " css_selector=\"li.Box-sc-g0xbh4-0\",\n", + " js=js_next_page if page > 0 else None,\n", + " bypass_cache=True,\n", + " js_only=page > 0\n", + " )\n", + "\n", + " assert result.success, f\"Failed to crawl page {page + 1}\"\n", + "\n", + " soup = BeautifulSoup(result.cleaned_html, 'html.parser')\n", + " commits = soup.select(\"li\")\n", + " all_commits.extend(commits)\n", + "\n", + " print(f\"Page {page + 1}: Found {len(commits)} commits\")\n", + "\n", + " await crawler.crawler_strategy.kill_session(session_id)\n", + " print(f\"Successfully crawled {len(all_commits)} commits across 3 pages\")\n", + "\n", + "await crawl_typescript_commits()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EJRnYsp6yZQN" + }, + "source": [ + "### Using JsonCssExtractionStrategy for Fast Structured Output" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1ZMqIzB_8SYp" + }, + "source": [ + "The JsonCssExtractionStrategy is a powerful feature of Crawl4AI that allows for precise, structured data extraction from web pages. Here's how it works:\n", + "\n", + "1. You define a schema that describes the pattern of data you're interested in extracting.\n", + "2. The schema includes a base selector that identifies repeating elements on the page.\n", + "3. Within the schema, you define fields, each with its own selector and type.\n", + "4. 
These field selectors are applied within the context of each base selector element.\n", + "5. The strategy supports nested structures, lists within lists, and various data types.\n", + "6. You can even include computed fields for more complex data manipulation.\n", + "\n", + "This approach allows for highly flexible and precise data extraction, transforming semi-structured web content into clean, structured JSON data. It's particularly useful for extracting consistent data patterns from pages like product listings, news articles, or search results.\n", + "\n", + "For more details and advanced usage, check out the full documentation on the Crawl4AI website." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "trCMR2T9yZQN", + "outputId": "718d36f4-cccf-40f4-8d8c-c3ba73524d16" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[LOG] 🌤️ Warming up the AsyncWebCrawler\n", + "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n", + "[LOG] 🕸️ Crawling https://www.nbcnews.com/business using AsyncPlaywrightCrawlerStrategy...\n", + "[LOG] ✅ Crawled https://www.nbcnews.com/business successfully!\n", + "[LOG] 🚀 Crawling done for https://www.nbcnews.com/business, success: True, time taken: 7.00 seconds\n", + "[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.32 seconds\n", + "[LOG] 🔥 Extracting semantic blocks for https://www.nbcnews.com/business, Strategy: AsyncWebCrawler\n", + "[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.48 seconds.\n", + "Successfully extracted 11 news teasers\n", + "{\n", + " \"category\": \"Business News\",\n", + " \"headline\": \"NBC ripped up its Olympics playbook for 2024 \\u2014 so far, the new strategy paid off\",\n", + " \"summary\": \"The Olympics have long been key to NBCUniversal. 
Paris marked the 18th Olympic Games broadcast by NBC in the U.S.\",\n", + " \"time\": \"13h ago\",\n", + " \"image\": {\n", + " \"src\": \"https://media-cldnry.s-nbcnews.com/image/upload/t_focal-200x100,f_auto,q_auto:best/rockcms/2024-09/240903-nbc-olympics-ch-1344-c7a486.jpg\",\n", + " \"alt\": \"Mike Tirico.\"\n", + " },\n", + " \"link\": \"https://www.nbcnews.com/business\"\n", + "}\n" + ] + } + ], + "source": [ + "async def extract_news_teasers():\n", + " schema = {\n", + " \"name\": \"News Teaser Extractor\",\n", + " \"baseSelector\": \".wide-tease-item__wrapper\",\n", + " \"fields\": [\n", + " {\n", + " \"name\": \"category\",\n", + " \"selector\": \".unibrow span[data-testid='unibrow-text']\",\n", + " \"type\": \"text\",\n", + " },\n", + " {\n", + " \"name\": \"headline\",\n", + " \"selector\": \".wide-tease-item__headline\",\n", + " \"type\": \"text\",\n", + " },\n", + " {\n", + " \"name\": \"summary\",\n", + " \"selector\": \".wide-tease-item__description\",\n", + " \"type\": \"text\",\n", + " },\n", + " {\n", + " \"name\": \"time\",\n", + " \"selector\": \"[data-testid='wide-tease-date']\",\n", + " \"type\": \"text\",\n", + " },\n", + " {\n", + " \"name\": \"image\",\n", + " \"type\": \"nested\",\n", + " \"selector\": \"picture.teasePicture img\",\n", + " \"fields\": [\n", + " {\"name\": \"src\", \"type\": \"attribute\", \"attribute\": \"src\"},\n", + " {\"name\": \"alt\", \"type\": \"attribute\", \"attribute\": \"alt\"},\n", + " ],\n", + " },\n", + " {\n", + " \"name\": \"link\",\n", + " \"selector\": \"a[href]\",\n", + " \"type\": \"attribute\",\n", + " \"attribute\": \"href\",\n", + " },\n", + " ],\n", + " }\n", + "\n", + " extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)\n", + "\n", + " async with AsyncWebCrawler(verbose=True) as crawler:\n", + " result = await crawler.arun(\n", + " url=\"https://www.nbcnews.com/business\",\n", + " extraction_strategy=extraction_strategy,\n", + " bypass_cache=True,\n", + " )\n", + "\n", + " assert result.success, \"Failed to crawl the page\"\n", + "\n", + " news_teasers = json.loads(result.extracted_content)\n", + " print(f\"Successfully extracted {len(news_teasers)} news teasers\")\n", + " print(json.dumps(news_teasers[0], indent=2))\n", + "\n", + "await extract_news_teasers()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FnyVhJaByZQN" + }, + "source": [ + "## Speed Comparison\n", + "\n", + "Let's compare the speed of Crawl4AI with Firecrawl, a paid service. Note that we can't run Firecrawl in this Colab environment, so we'll simulate its performance based on previously recorded data." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "agDD186f3wig" + }, + "source": [ + "💡 **Note on Speed Comparison:**\n", + "\n", + "The speed test conducted here is running on Google Colab, where the internet speed and performance can vary and may not reflect optimal conditions. When we call Firecrawl's API, we're seeing its best performance, while Crawl4AI's performance is limited by Colab's network speed.\n", + "\n", + "For a more accurate comparison, it's recommended to run these tests on your own servers or computers with a stable and fast internet connection. Despite these limitations, Crawl4AI still demonstrates faster performance in this environment.\n", + "\n", + "If you run these tests locally, you may observe an even more significant speed advantage for Crawl4AI compared to other services." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "F7KwHv8G1LbY" + }, + "outputs": [], + "source": [ + "!pip install firecrawl" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "91813zILyZQN", + "outputId": "663223db-ab89-4976-b233-05ceca62b19b" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Firecrawl (simulated):\n", + "Time taken: 4.38 seconds\n", + "Content length: 41967 characters\n", + "Images found: 49\n", + "\n", + "Crawl4AI (simple crawl):\n", + "Time taken: 4.22 seconds\n", + "Content length: 18221 characters\n", + "Images found: 49\n", + "\n", + "Crawl4AI (with JavaScript execution):\n", + "Time taken: 9.13 seconds\n", + "Content length: 34243 characters\n", + "Images found: 89\n" + ] + } + ], + "source": [ + "import os\n", + "from google.colab import userdata\n", + "os.environ['FIRECRAWL_API_KEY'] = userdata.get('FIRECRAWL_API_KEY')\n", + "import time\n", + "from firecrawl import FirecrawlApp\n", + "\n", + "async def speed_comparison():\n", + " # Simulated Firecrawl performance\n", + " app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])\n", + " start = time.time()\n", + " scrape_status = app.scrape_url(\n", + " 'https://www.nbcnews.com/business',\n", + " params={'formats': ['markdown', 'html']}\n", + " )\n", + " end = time.time()\n", + " print(\"Firecrawl (simulated):\")\n", + " print(f\"Time taken: {end - start:.2f} seconds\")\n", + " print(f\"Content length: {len(scrape_status['markdown'])} characters\")\n", + " print(f\"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}\")\n", + " print()\n", + "\n", + " async with AsyncWebCrawler() as crawler:\n", + " # Crawl4AI simple crawl\n", + " start = time.time()\n", + " result = await crawler.arun(\n", + " url=\"https://www.nbcnews.com/business\",\n", + " word_count_threshold=0,\n", + " bypass_cache=True,\n", + " verbose=False\n", + " )\n", + " end = time.time()\n", + " print(\"Crawl4AI (simple crawl):\")\n", + " print(f\"Time taken: {end - start:.2f} seconds\")\n", + " print(f\"Content length: {len(result.markdown)} characters\")\n", + " print(f\"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}\")\n", + " print()\n", + "\n", + " # Crawl4AI with JavaScript execution\n", + " start = time.time()\n", + " result = await crawler.arun(\n", + " url=\"https://www.nbcnews.com/business\",\n", + " js_code=[\"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();\"],\n", + " word_count_threshold=0,\n", + " bypass_cache=True,\n", + " verbose=False\n", + " )\n", + " end = time.time()\n", + " print(\"Crawl4AI (with JavaScript execution):\")\n", + " print(f\"Time taken: {end - start:.2f} seconds\")\n", + " print(f\"Content length: {len(result.markdown)} characters\")\n", + " print(f\"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}\")\n", + "\n", + "await speed_comparison()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OBFFYVJIyZQN" + }, + "source": [ + "If you run on a local machine with a proper internet speed:\n", + "- Simple crawl: Crawl4AI is typically over 3-4 times faster than Firecrawl.\n", + "- With JavaScript execution: Even when executing JavaScript to load more content (potentially doubling the number of images found), Crawl4AI is still faster than Firecrawl's simple crawl.\n", + "\n", 
+ "Please note that actual performance may vary depending on network conditions and the specific content being crawled." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "A6_1RK1_yZQO" + }, + "source": [ + "## Conclusion\n", + "\n", + "In this notebook, we've explored the powerful features of Crawl4AI, including:\n", + "\n", + "1. Basic crawling\n", + "2. JavaScript execution and CSS selector usage\n", + "3. Proxy support\n", + "4. Structured data extraction with OpenAI\n", + "5. Advanced multi-page crawling with JavaScript execution\n", + "6. Fast structured output using JsonCssExtractionStrategy\n", + "7. Speed comparison with other services\n", + "\n", + "Crawl4AI offers a fast, flexible, and powerful solution for web crawling and data extraction tasks. Its asynchronous architecture and advanced features make it suitable for a wide range of applications, from simple web scraping to complex, multi-page data extraction scenarios.\n", + "\n", + "For more information and advanced usage, please visit the [Crawl4AI documentation](https://crawl4ai.com/mkdocs/).\n", + "\n", + "Happy crawling!" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/docs/examples/research_assistant.py b/docs/examples/research_assistant.py new file mode 100644 index 0000000000000000000000000000000000000000..de35ce8455606cac13fda96fc6a44f006a9a6822 --- /dev/null +++ b/docs/examples/research_assistant.py @@ -0,0 +1,195 @@ +# Make sure to install the required packageschainlit and groq +import os, time +from openai import AsyncOpenAI +import chainlit as cl +import re +import requests +from io import BytesIO +from chainlit.element import ElementBased +from groq import Groq + +# Import threadpools to run the crawl_url function in a separate thread +from concurrent.futures import ThreadPoolExecutor + +client = AsyncOpenAI(base_url="https://api.groq.com/openai/v1", api_key=os.getenv("GROQ_API_KEY")) + +# Instrument the OpenAI client +cl.instrument_openai() + +settings = { + "model": "llama3-8b-8192", + "temperature": 0.5, + "max_tokens": 500, + "top_p": 1, + "frequency_penalty": 0, + "presence_penalty": 0, +} + +def extract_urls(text): + url_pattern = re.compile(r'(https?://\S+)') + return url_pattern.findall(text) + +def crawl_url(url): + data = { + "urls": [url], + "include_raw_html": True, + "word_count_threshold": 10, + "extraction_strategy": "NoExtractionStrategy", + "chunking_strategy": "RegexChunking" + } + response = requests.post("https://crawl4ai.com/crawl", json=data) + response_data = response.json() + response_data = response_data['results'][0] + return response_data['markdown'] + +@cl.on_chat_start +async def on_chat_start(): + cl.user_session.set("session", { + "history": [], + "context": {} + }) + await cl.Message( + content="Welcome to the chat! How can I assist you today?" 
+ ).send() + +@cl.on_message +async def on_message(message: cl.Message): + user_session = cl.user_session.get("session") + + # Extract URLs from the user's message + urls = extract_urls(message.content) + + + futures = [] + with ThreadPoolExecutor() as executor: + for url in urls: + futures.append(executor.submit(crawl_url, url)) + + results = [future.result() for future in futures] + + for url, result in zip(urls, results): + ref_number = f"REF_{len(user_session['context']) + 1}" + user_session["context"][ref_number] = { + "url": url, + "content": result + } + + + user_session["history"].append({ + "role": "user", + "content": message.content + }) + + # Create a system message that includes the context + context_messages = [ + f'\n{data["content"]}\n' + for ref, data in user_session["context"].items() + ] + if context_messages: + system_message = { + "role": "system", + "content": ( + "You are a helpful bot. Use the following context for answering questions. " + "Refer to the sources using the REF number in square brackets, e.g., [1], only if the source is given in the appendices below.\n\n" + "If the question requires any information from the provided appendices or context, refer to the sources. " + "If not, there is no need to add a references section. " + "At the end of your response, provide a reference section listing the URLs and their REF numbers only if sources from the appendices were used.\n\n" + "\n\n".join(context_messages) + ) + } + else: + system_message = { + "role": "system", + "content": "You are a helpful assistant." + } + + + msg = cl.Message(content="") + await msg.send() + + # Get response from the LLM + stream = await client.chat.completions.create( + messages=[ + system_message, + *user_session["history"] + ], + stream=True, + **settings + ) + + assistant_response = "" + async for part in stream: + if token := part.choices[0].delta.content: + assistant_response += token + await msg.stream_token(token) + + # Add assistant message to the history + user_session["history"].append({ + "role": "assistant", + "content": assistant_response + }) + await msg.update() + + # Append the reference section to the assistant's response + reference_section = "\n\nReferences:\n" + for ref, data in user_session["context"].items(): + reference_section += f"[{ref.split('_')[1]}]: {data['url']}\n" + + msg.content += reference_section + await msg.update() + + +@cl.on_audio_chunk +async def on_audio_chunk(chunk: cl.AudioChunk): + if chunk.isStart: + buffer = BytesIO() + # This is required for whisper to recognize the file type + buffer.name = f"input_audio.{chunk.mimeType.split('/')[1]}" + # Initialize the session for a new audio stream + cl.user_session.set("audio_buffer", buffer) + cl.user_session.set("audio_mime_type", chunk.mimeType) + + # Write the chunks to a buffer and transcribe the whole audio at the end + cl.user_session.get("audio_buffer").write(chunk.data) + + pass + +@cl.step(type="tool") +async def speech_to_text(audio_file): + cli = Groq() + + response = await client.audio.transcriptions.create( + model="whisper-large-v3", file=audio_file + ) + + return response.text + + +@cl.on_audio_end +async def on_audio_end(elements: list[ElementBased]): + # Get the audio buffer from the session + audio_buffer: BytesIO = cl.user_session.get("audio_buffer") + audio_buffer.seek(0) # Move the file pointer to the beginning + audio_file = audio_buffer.read() + audio_mime_type: str = cl.user_session.get("audio_mime_type") + + start_time = time.time() + whisper_input = (audio_buffer.name, 
audio_file, audio_mime_type) + transcription = await speech_to_text(whisper_input) + end_time = time.time() + print(f"Transcription took {end_time - start_time} seconds") + + user_msg = cl.Message( + author="You", + type="user_message", + content=transcription + ) + await user_msg.send() + await on_message(user_msg) + + +if __name__ == "__main__": + from chainlit.cli import run_chainlit + run_chainlit(__file__) + + diff --git a/docs/examples/rest_call.py b/docs/examples/rest_call.py new file mode 100644 index 0000000000000000000000000000000000000000..465c61142992d340e7275a6efd4b8c7627cefb17 --- /dev/null +++ b/docs/examples/rest_call.py @@ -0,0 +1,64 @@ + +import requests, base64, os + +data = { + "urls": ["https://www.nbcnews.com/business"], + "screenshot": True, +} + +response = requests.post("https://crawl4ai.com/crawl", json=data) +result = response.json()['results'][0] +print(result.keys()) +# dict_keys(['url', 'html', 'success', 'cleaned_html', 'media', +# 'links', 'screenshot', 'markdown', 'extracted_content', +# 'metadata', 'error_message']) +with open("screenshot.png", "wb") as f: + f.write(base64.b64decode(result['screenshot'])) + +# Example of filtering the content using CSS selectors +data = { + "urls": [ + "https://www.nbcnews.com/business" + ], + "css_selector": "article", + "screenshot": True, +} + +# Example of executing a JS script on the page before extracting the content +data = { + "urls": [ + "https://www.nbcnews.com/business" + ], + "screenshot": True, + 'js' : [""" + const loadMoreButton = Array.from(document.querySelectorAll('button')). + find(button => button.textContent.includes('Load More')); + loadMoreButton && loadMoreButton.click(); + """] +} + +# Example of using a custom extraction strategy +data = { + "urls": [ + "https://www.nbcnews.com/business" + ], + "extraction_strategy": "CosineStrategy", + "extraction_strategy_args": { + "semantic_filter": "inflation rent prices" + }, +} + +# Example of using LLM to extract content +data = { + "urls": [ + "https://www.nbcnews.com/business" + ], + "extraction_strategy": "LLMExtractionStrategy", + "extraction_strategy_args": { + "provider": "groq/llama3-8b-8192", + "api_token": os.environ.get("GROQ_API_KEY"), + "instruction": """I am interested in only financial news, + and translate them in French.""" + }, +} + diff --git a/docs/examples/sample_ecommerce.html b/docs/examples/sample_ecommerce.html new file mode 100644 index 0000000000000000000000000000000000000000..4698d9c69ba6ea36fd4709b7daca3ab78ec7f0fb --- /dev/null +++ b/docs/examples/sample_ecommerce.html @@ -0,0 +1,106 @@ + + + + + + Sample E-commerce Page for JsonCssExtractionStrategy Testing + + + +

    <h1>Sample E-commerce Product Catalog</h1>
    + + + + \ No newline at end of file diff --git a/docs/examples/ssl_example.py b/docs/examples/ssl_example.py new file mode 100644 index 0000000000000000000000000000000000000000..410e9485e4b17a1dd2af901f32eea283d6db24ea --- /dev/null +++ b/docs/examples/ssl_example.py @@ -0,0 +1,46 @@ +"""Example showing how to work with SSL certificates in Crawl4AI.""" + +import asyncio +import os +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode + +# Create tmp directory if it doesn't exist +parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +tmp_dir = os.path.join(parent_dir, "tmp") +os.makedirs(tmp_dir, exist_ok=True) + +async def main(): + # Configure crawler to fetch SSL certificate + config = CrawlerRunConfig( + fetch_ssl_certificate=True, + cache_mode=CacheMode.BYPASS # Bypass cache to always get fresh certificates + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url='https://example.com', + config=config + ) + + if result.success and result.ssl_certificate: + cert = result.ssl_certificate + + # 1. Access certificate properties directly + print("\nCertificate Information:") + print(f"Issuer: {cert.issuer.get('CN', '')}") + print(f"Valid until: {cert.valid_until}") + print(f"Fingerprint: {cert.fingerprint}") + + # 2. Export certificate in different formats + cert.to_json(os.path.join(tmp_dir, "certificate.json")) # For analysis + print("\nCertificate exported to:") + print(f"- JSON: {os.path.join(tmp_dir, 'certificate.json')}") + + pem_data = cert.to_pem(os.path.join(tmp_dir, "certificate.pem")) # For web servers + print(f"- PEM: {os.path.join(tmp_dir, 'certificate.pem')}") + + der_data = cert.to_der(os.path.join(tmp_dir, "certificate.der")) # For Java apps + print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}") + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/docs/examples/storage_state_tutorial.md b/docs/examples/storage_state_tutorial.md new file mode 100644 index 0000000000000000000000000000000000000000..304e6399ad906f66c103b9b5b7789fb76e369d00 --- /dev/null +++ b/docs/examples/storage_state_tutorial.md @@ -0,0 +1,225 @@ +### Using `storage_state` to Pre-Load Cookies and LocalStorage + +Crawl4ai’s `AsyncWebCrawler` lets you preserve and reuse session data, including cookies and localStorage, across multiple runs. By providing a `storage_state`, you can start your crawls already “logged in” or with any other necessary session data—no need to repeat the login flow every time. + +#### What is `storage_state`? + +`storage_state` can be: + +- A dictionary containing cookies and localStorage data. +- A path to a JSON file that holds this information. + +When you pass `storage_state` to the crawler, it applies these cookies and localStorage entries before loading any pages. This means your crawler effectively starts in a known authenticated or pre-configured state. + +#### Example Structure + +Here’s an example storage state: + +```json +{ + "cookies": [ + { + "name": "session", + "value": "abcd1234", + "domain": "example.com", + "path": "/", + "expires": 1675363572.037711, + "httpOnly": false, + "secure": false, + "sameSite": "None" + } + ], + "origins": [ + { + "origin": "https://example.com", + "localStorage": [ + { "name": "token", "value": "my_auth_token" }, + { "name": "refreshToken", "value": "my_refresh_token" } + ] + } + ] +} +``` + +This JSON sets a `session` cookie and two localStorage entries (`token` and `refreshToken`) for `https://example.com`. 
+ +--- + +### Passing `storage_state` as a Dictionary + +You can directly provide the data as a dictionary: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def main(): + storage_dict = { + "cookies": [ + { + "name": "session", + "value": "abcd1234", + "domain": "example.com", + "path": "/", + "expires": 1675363572.037711, + "httpOnly": False, + "secure": False, + "sameSite": "None" + } + ], + "origins": [ + { + "origin": "https://example.com", + "localStorage": [ + {"name": "token", "value": "my_auth_token"}, + {"name": "refreshToken", "value": "my_refresh_token"} + ] + } + ] + } + + async with AsyncWebCrawler( + headless=True, + storage_state=storage_dict + ) as crawler: + result = await crawler.arun(url='https://example.com/protected') + if result.success: + print("Crawl succeeded with pre-loaded session data!") + print("Page HTML length:", len(result.html)) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +--- + +### Passing `storage_state` as a File + +If you prefer a file-based approach, save the JSON above to `mystate.json` and reference it: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def main(): + async with AsyncWebCrawler( + headless=True, + storage_state="mystate.json" # Uses a JSON file instead of a dictionary + ) as crawler: + result = await crawler.arun(url='https://example.com/protected') + if result.success: + print("Crawl succeeded with pre-loaded session data!") + print("Page HTML length:", len(result.html)) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +--- + +### Using `storage_state` to Avoid Repeated Logins (Sign In Once, Use Later) + +A common scenario is when you need to log in to a site (entering username/password, etc.) to access protected pages. Doing so every crawl is cumbersome. Instead, you can: + +1. Perform the login once in a hook. +2. After login completes, export the resulting `storage_state` to a file. +3. On subsequent runs, provide that `storage_state` to skip the login step. 
+ +**Step-by-Step Example:** + +**First Run (Perform Login and Save State):** + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CacheMode +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator + +async def on_browser_created_hook(browser): + # Access the default context and create a page + context = browser.contexts[0] + page = await context.new_page() + + # Navigate to the login page + await page.goto("https://example.com/login", wait_until="domcontentloaded") + + # Fill in credentials and submit + await page.fill("input[name='username']", "myuser") + await page.fill("input[name='password']", "mypassword") + await page.click("button[type='submit']") + await page.wait_for_load_state("networkidle") + + # Now the site sets tokens in localStorage and cookies + # Export this state to a file so we can reuse it + await context.storage_state(path="my_storage_state.json") + await page.close() + +async def main(): + # First run: perform login and export the storage_state + async with AsyncWebCrawler( + headless=True, + verbose=True, + hooks={"on_browser_created": on_browser_created_hook}, + use_persistent_context=True, + user_data_dir="./my_user_data" + ) as crawler: + + # After on_browser_created_hook runs, we have storage_state saved to my_storage_state.json + result = await crawler.arun( + url='https://example.com/protected-page', + cache_mode=CacheMode.BYPASS, + markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}), + ) + print("First run result success:", result.success) + if result.success: + print("Protected page HTML length:", len(result.html)) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Second Run (Reuse Saved State, No Login Needed):** + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CacheMode +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator + +async def main(): + # Second run: no need to hook on_browser_created this time. + # Just provide the previously saved storage state. + async with AsyncWebCrawler( + headless=True, + verbose=True, + use_persistent_context=True, + user_data_dir="./my_user_data", + storage_state="my_storage_state.json" # Reuse previously exported state + ) as crawler: + + # Now the crawler starts already logged in + result = await crawler.arun( + url='https://example.com/protected-page', + cache_mode=CacheMode.BYPASS, + markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}), + ) + print("Second run result success:", result.success) + if result.success: + print("Protected page HTML length:", len(result.html)) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**What’s Happening Here?** + +- During the first run, the `on_browser_created_hook` logs into the site. +- After logging in, the crawler exports the current session (cookies, localStorage, etc.) to `my_storage_state.json`. +- On subsequent runs, passing `storage_state="my_storage_state.json"` starts the browser context with these tokens already in place, skipping the login steps. + +**Sign Out Scenario:** +If the website allows you to sign out by clearing tokens or by navigating to a sign-out URL, you can also run a script that uses `on_browser_created_hook` or `arun` to simulate signing out, then export the resulting `storage_state` again. That would give you a baseline “logged out” state to start fresh from next time. 
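A minimal sketch of that sign-out flow is shown below. The logout URL, the assumption that visiting `/logout` ends the server-side session, and the output file name are hypothetical placeholders; adapt them to the target site.

**Sign Out and Save a Logged-Out State (sketch):**

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def on_browser_created_sign_out_hook(browser):
    # Reuse the default context, which still holds the logged-in session
    context = browser.contexts[0]
    page = await context.new_page()

    # Hypothetical sign-out URL; assumed to invalidate the server-side session
    await page.goto("https://example.com/logout", wait_until="domcontentloaded")

    # Clear any remaining client-side state (cookies and localStorage)
    await context.clear_cookies()
    await page.evaluate("localStorage.clear()")

    # Export the now logged-out baseline state for future runs
    await context.storage_state(path="my_signed_out_state.json")
    await page.close()

async def main():
    async with AsyncWebCrawler(
        headless=True,
        verbose=True,
        hooks={"on_browser_created": on_browser_created_sign_out_hook},
        use_persistent_context=True,
        user_data_dir="./my_user_data",
        storage_state="my_storage_state.json"  # start from the logged-in state
    ) as crawler:
        result = await crawler.arun(url="https://example.com")
        print("Sign-out run success:", result.success)

if __name__ == "__main__":
    asyncio.run(main())
```

On later runs, passing `storage_state="my_signed_out_state.json"` starts the browser from this clean, logged-out baseline.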
+ +--- + +### Conclusion + +By using `storage_state`, you can skip repetitive actions, like logging in, and jump straight into crawling protected content. Whether you provide a file path or a dictionary, this powerful feature helps maintain state between crawls, simplifying your data extraction pipelines. \ No newline at end of file diff --git a/docs/examples/summarize_page.py b/docs/examples/summarize_page.py new file mode 100644 index 0000000000000000000000000000000000000000..8515899970a3d02301c62803dd99999a0cfa42be --- /dev/null +++ b/docs/examples/summarize_page.py @@ -0,0 +1,46 @@ +import os +import time +import json +from crawl4ai.web_crawler import WebCrawler +from crawl4ai.chunking_strategy import * +from crawl4ai.extraction_strategy import * +from crawl4ai.crawler_strategy import * + +url = r'https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot' + +crawler = WebCrawler() +crawler.warmup() + +from pydantic import BaseModel, Field + +class PageSummary(BaseModel): + title: str = Field(..., description="Title of the page.") + summary: str = Field(..., description="Summary of the page.") + brief_summary: str = Field(..., description="Brief summary of the page.") + keywords: list = Field(..., description="Keywords assigned to the page.") + +result = crawler.run( + url=url, + word_count_threshold=1, + extraction_strategy= LLMExtractionStrategy( + provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'), + schema=PageSummary.model_json_schema(), + extraction_type="schema", + apply_chunking =False, + instruction="From the crawled content, extract the following details: "\ + "1. Title of the page "\ + "2. Summary of the page, which is a detailed summary "\ + "3. Brief summary of the page, which is a paragraph text "\ + "4. Keywords assigned to the page, which is a list of keywords. "\ + 'The extracted JSON format should look like this: '\ + '{ "title": "Page Title", "summary": "Detailed summary of the page.", "brief_summary": "Brief summary in a paragraph.", "keywords": ["keyword1", "keyword2", "keyword3"] }' + ), + bypass_cache=True, +) + +page_summary = json.loads(result.extracted_content) + +print(page_summary) + +with open(".data/page_summary.json", "w", encoding="utf-8") as f: + f.write(result.extracted_content) diff --git a/docs/examples/tutorial_dynamic_clicks.md b/docs/examples/tutorial_dynamic_clicks.md new file mode 100644 index 0000000000000000000000000000000000000000..d9669952b4a63e47d71ea71d9396d3d77ae3888b --- /dev/null +++ b/docs/examples/tutorial_dynamic_clicks.md @@ -0,0 +1,117 @@ +# Tutorial: Clicking Buttons to Load More Content with Crawl4AI + +## Introduction + +When scraping dynamic websites, it’s common to encounter “Load More” or “Next” buttons that must be clicked to reveal new content. Crawl4AI provides a straightforward way to handle these situations using JavaScript execution and waiting conditions. In this tutorial, we’ll cover two approaches: + +1. **Step-by-step (Session-based) Approach:** Multiple calls to `arun()` to progressively load more content. +2. **Single-call Approach:** Execute a more complex JavaScript snippet inside a single `arun()` call to handle all clicks at once before the extraction. 
+ +## Prerequisites + +- A working installation of Crawl4AI +- Basic familiarity with Python’s `async`/`await` syntax + +## Step-by-Step Approach + +Use a session ID to maintain state across multiple `arun()` calls: + +```python +from crawl4ai import AsyncWebCrawler, CacheMode + +js_code = [ + # This JS finds the “Next” button and clicks it + "const nextButton = document.querySelector('button.next'); nextButton && nextButton.click();" +] + +wait_for_condition = "css:.new-content-class" + +async with AsyncWebCrawler(headless=True, verbose=True) as crawler: + # 1. Load the initial page + result_initial = await crawler.arun( + url="https://example.com", + cache_mode=CacheMode.BYPASS, + session_id="my_session" + ) + + # 2. Click the 'Next' button and wait for new content + result_next = await crawler.arun( + url="https://example.com", + session_id="my_session", + js_code=js_code, + wait_for=wait_for_condition, + js_only=True, + cache_mode=CacheMode.BYPASS + ) + +# `result_next` now contains the updated HTML after clicking 'Next' +``` + +**Key Points:** +- **`session_id`**: Keeps the same browser context open. +- **`js_code`**: Executes JavaScript in the context of the already loaded page. +- **`wait_for`**: Ensures the crawler waits until new content is fully loaded. +- **`js_only=True`**: Runs the JS in the current session without reloading the page. + +By repeating the `arun()` call multiple times and modifying the `js_code` (e.g., clicking different modules or pages), you can iteratively load all the desired content. + +## Single-call Approach + +If the page allows it, you can run a single `arun()` call with a more elaborate JavaScript snippet that: +- Iterates over all the modules or "Next" buttons +- Clicks them one by one +- Waits for content updates between each click +- Once done, returns control to Crawl4AI for extraction. + +Example snippet: + +```python +from crawl4ai import AsyncWebCrawler, CacheMode + +js_code = [ + # Example JS that clicks multiple modules: + """ + (async () => { + const modules = document.querySelectorAll('.module-item'); + for (let i = 0; i < modules.length; i++) { + modules[i].scrollIntoView(); + modules[i].click(); + // Wait for each module’s content to load, adjust 100ms as needed + await new Promise(r => setTimeout(r, 100)); + } + })(); + """ +] + +async with AsyncWebCrawler(headless=True, verbose=True) as crawler: + result = await crawler.arun( + url="https://example.com", + js_code=js_code, + wait_for="css:.final-loaded-content-class", + cache_mode=CacheMode.BYPASS + ) + +# `result` now contains all content after all modules have been clicked in one go. +``` + +**Key Points:** +- All interactions (clicks and waits) happen before the extraction. +- Ideal for pages where all steps can be done in a single pass. + +## Choosing the Right Approach + +- **Step-by-Step (Session-based)**: + - Good when you need fine-grained control or must dynamically check conditions before clicking the next page. + - Useful if the page requires multiple conditions checked at runtime. + +- **Single-call**: + - Perfect if the sequence of interactions is known in advance. + - Cleaner code if the page’s structure is consistent and predictable. + +## Conclusion + +Crawl4AI makes it easy to handle dynamic content: +- Use session IDs and multiple `arun()` calls for stepwise crawling. +- Or pack all actions into one `arun()` call if the interactions are well-defined upfront. + +This flexibility ensures you can handle a wide range of dynamic web pages efficiently. 
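When the number of pages isn't known in advance, the step-by-step approach can check at runtime whether a "Next" button is still present and stop once it disappears. The sketch below reuses the placeholder selectors from earlier in this tutorial (`button.next`, `.new-content-class`) and a hypothetical listing URL; treat it as an outline rather than a drop-in implementation.

```python
import asyncio
from bs4 import BeautifulSoup
from crawl4ai import AsyncWebCrawler, CacheMode

# Placeholder JS: clicks the 'Next' button if it exists
js_click_next = [
    "const nextButton = document.querySelector('button.next'); nextButton && nextButton.click();"
]

async def crawl_until_last_page(max_pages: int = 10):
    pages = []
    async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
        # Initial load establishes the session
        result = await crawler.arun(
            url="https://example.com",
            session_id="pagination_session",
            cache_mode=CacheMode.BYPASS
        )
        pages.append(result.cleaned_html)

        for _ in range(max_pages - 1):
            # Stop when the current page no longer offers a 'Next' button
            soup = BeautifulSoup(result.cleaned_html, "html.parser")
            if not soup.select_one("button.next"):
                break

            # Click 'Next' inside the same session and wait for fresh content
            result = await crawler.arun(
                url="https://example.com",
                session_id="pagination_session",
                js_code=js_click_next,
                wait_for="css:.new-content-class",
                js_only=True,
                cache_mode=CacheMode.BYPASS
            )
            pages.append(result.cleaned_html)

    print(f"Collected {len(pages)} page snapshots")
    return pages

asyncio.run(crawl_until_last_page())
```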
diff --git a/docs/examples/v0.3.74.overview.py b/docs/examples/v0.3.74.overview.py new file mode 100644 index 0000000000000000000000000000000000000000..362ae8fc4446a9173e0dd91c6f0f9d995c6b7d53 --- /dev/null +++ b/docs/examples/v0.3.74.overview.py @@ -0,0 +1,277 @@ +import os, sys +# append the parent directory to the sys.path +parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(parent_dir) +parent_parent_dir = os.path.dirname(parent_dir) +sys.path.append(parent_parent_dir) +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) +__data__ = os.path.join(__location__, "__data") +import asyncio +from pathlib import Path +import aiohttp +import json +from crawl4ai import AsyncWebCrawler, CacheMode +from crawl4ai.content_filter_strategy import BM25ContentFilter + +# 1. File Download Processing Example +async def download_example(): + """Example of downloading files from Python.org""" + # downloads_path = os.path.join(os.getcwd(), "downloads") + downloads_path = os.path.join(Path.home(), ".crawl4ai", "downloads") + os.makedirs(downloads_path, exist_ok=True) + + print(f"Downloads will be saved to: {downloads_path}") + + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path=downloads_path, + verbose=True + ) as crawler: + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code=""" + // Find and click the first Windows installer link + const downloadLink = document.querySelector('a[href$=".exe"]'); + if (downloadLink) { + console.log('Found download link:', downloadLink.href); + downloadLink.click(); + } else { + console.log('No .exe download link found'); + } + """, + delay_before_return_html=1, # Wait 5 seconds to ensure download starts + cache_mode=CacheMode.BYPASS + ) + + if result.downloaded_files: + print("\nDownload successful!") + print("Downloaded files:") + for file_path in result.downloaded_files: + print(f"- {file_path}") + print(f" File size: {os.path.getsize(file_path) / (1024*1024):.2f} MB") + else: + print("\nNo files were downloaded") + +# 2. Local File and Raw HTML Processing Example +async def local_and_raw_html_example(): + """Example of processing local files and raw HTML""" + # Create a sample HTML file + sample_file = os.path.join(__data__, "sample.html") + with open(sample_file, "w") as f: + f.write(""" + +

        <html>
            <body>
                <h1>Test Content</h1>
                <p>This is a test paragraph.</p>
            </body>
        </html>

    + + """) + + async with AsyncWebCrawler(verbose=True) as crawler: + # Process local file + local_result = await crawler.arun( + url=f"file://{os.path.abspath(sample_file)}" + ) + + # Process raw HTML + raw_html = """ + +

        <html>
            <body>
                <h1>Raw HTML Test</h1>
                <p>This is a test of raw HTML processing.</p>
            </body>
        </html>

    + + """ + raw_result = await crawler.arun( + url=f"raw:{raw_html}" + ) + + # Clean up + os.remove(sample_file) + + print("Local file content:", local_result.markdown) + print("\nRaw HTML content:", raw_result.markdown) + +# 3. Enhanced Markdown Generation Example +async def markdown_generation_example(): + """Example of enhanced markdown generation with citations and LLM-friendly features""" + async with AsyncWebCrawler(verbose=True) as crawler: + # Create a content filter (optional) + content_filter = BM25ContentFilter( + # user_query="History and cultivation", + bm25_threshold=1.0 + ) + + result = await crawler.arun( + url="https://en.wikipedia.org/wiki/Apple", + css_selector="main div#bodyContent", + content_filter=content_filter, + cache_mode=CacheMode.BYPASS + ) + + from crawl4ai import AsyncWebCrawler + from crawl4ai.content_filter_strategy import BM25ContentFilter + + result = await crawler.arun( + url="https://en.wikipedia.org/wiki/Apple", + css_selector="main div#bodyContent", + content_filter=BM25ContentFilter() + ) + print(result.markdown_v2.fit_markdown) + + print("\nMarkdown Generation Results:") + print(f"1. Original markdown length: {len(result.markdown)}") + print(f"2. New markdown versions (markdown_v2):") + print(f" - Raw markdown length: {len(result.markdown_v2.raw_markdown)}") + print(f" - Citations markdown length: {len(result.markdown_v2.markdown_with_citations)}") + print(f" - References section length: {len(result.markdown_v2.references_markdown)}") + if result.markdown_v2.fit_markdown: + print(f" - Filtered markdown length: {len(result.markdown_v2.fit_markdown)}") + + # Save examples to files + output_dir = os.path.join(__data__, "markdown_examples") + os.makedirs(output_dir, exist_ok=True) + + # Save different versions + with open(os.path.join(output_dir, "1_raw_markdown.md"), "w") as f: + f.write(result.markdown_v2.raw_markdown) + + with open(os.path.join(output_dir, "2_citations_markdown.md"), "w") as f: + f.write(result.markdown_v2.markdown_with_citations) + + with open(os.path.join(output_dir, "3_references.md"), "w") as f: + f.write(result.markdown_v2.references_markdown) + + if result.markdown_v2.fit_markdown: + with open(os.path.join(output_dir, "4_filtered_markdown.md"), "w") as f: + f.write(result.markdown_v2.fit_markdown) + + print(f"\nMarkdown examples saved to: {output_dir}") + + # Show a sample of citations and references + print("\nSample of markdown with citations:") + print(result.markdown_v2.markdown_with_citations[:500] + "...\n") + print("Sample of references:") + print('\n'.join(result.markdown_v2.references_markdown.split('\n')[:10]) + "...") + +# 4. 
Browser Management Example +async def browser_management_example(): + """Example of using enhanced browser management features""" + # Use the specified user directory path + user_data_dir = os.path.join(Path.home(), ".crawl4ai", "browser_profile") + os.makedirs(user_data_dir, exist_ok=True) + + print(f"Browser profile will be saved to: {user_data_dir}") + + async with AsyncWebCrawler( + use_managed_browser=True, + user_data_dir=user_data_dir, + headless=False, + verbose=True + ) as crawler: + + result = await crawler.arun( + url="https://crawl4ai.com", + # session_id="persistent_session_1", + cache_mode=CacheMode.BYPASS + ) + # Use GitHub as an example - it's a good test for browser management + # because it requires proper browser handling + result = await crawler.arun( + url="https://github.com/trending", + # session_id="persistent_session_1", + cache_mode=CacheMode.BYPASS + ) + + print("\nBrowser session result:", result.success) + if result.success: + print("Page title:", result.metadata.get('title', 'No title found')) + +# 5. API Usage Example +async def api_example(): + """Example of using the new API endpoints""" + api_token = os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code" + headers = {'Authorization': f'Bearer {api_token}'} + async with aiohttp.ClientSession() as session: + # Submit crawl job + crawl_request = { + "urls": ["https://news.ycombinator.com"], # Hacker News as an example + "extraction_config": { + "type": "json_css", + "params": { + "schema": { + "name": "Hacker News Articles", + "baseSelector": ".athing", + "fields": [ + { + "name": "title", + "selector": ".title a", + "type": "text" + }, + { + "name": "score", + "selector": ".score", + "type": "text" + }, + { + "name": "url", + "selector": ".title a", + "type": "attribute", + "attribute": "href" + } + ] + } + } + }, + "crawler_params": { + "headless": True, + # "use_managed_browser": True + }, + "cache_mode": "bypass", + # "screenshot": True, + # "magic": True + } + + async with session.post( + "http://localhost:11235/crawl", + json=crawl_request, + headers=headers + ) as response: + task_data = await response.json() + task_id = task_data["task_id"] + + # Check task status + while True: + async with session.get( + f"http://localhost:11235/task/{task_id}", + headers=headers + ) as status_response: + result = await status_response.json() + print(f"Task status: {result['status']}") + + if result["status"] == "completed": + print("Task completed!") + print("Results:") + news = json.loads(result["results"][0]['extracted_content']) + print(json.dumps(news[:4], indent=2)) + break + else: + await asyncio.sleep(1) + +# Main execution +async def main(): + # print("Running Crawl4AI feature examples...") + + # print("\n1. Running Download Example:") + # await download_example() + + # print("\n2. Running Markdown Generation Example:") + # await markdown_generation_example() + + # # print("\n3. Running Local and Raw HTML Example:") + # await local_and_raw_html_example() + + # # print("\n4. Running Browser Management Example:") + await browser_management_example() + + # print("\n5. 
Running API Example:") + await api_example() + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/docs/examples/v0_4_24_walkthrough.py b/docs/examples/v0_4_24_walkthrough.py new file mode 100644 index 0000000000000000000000000000000000000000..135ac29c7ef9f4b75d328f3583675867d66367e0 --- /dev/null +++ b/docs/examples/v0_4_24_walkthrough.py @@ -0,0 +1,443 @@ +""" +Crawl4AI v0.4.24 Feature Walkthrough +=================================== + +This script demonstrates the new features introduced in Crawl4AI v0.4.24. +Each section includes detailed examples and explanations of the new capabilities. +""" + +import asyncio +import os +import json +import re +from typing import List, Optional, Dict, Any +from pydantic import BaseModel, Field +from crawl4ai import ( + AsyncWebCrawler, + BrowserConfig, + CrawlerRunConfig, + CacheMode, + LLMExtractionStrategy, + JsonCssExtractionStrategy +) +from crawl4ai.content_filter_strategy import RelevantContentFilter +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator +from bs4 import BeautifulSoup + +# Sample HTML for demonstrations +SAMPLE_HTML = """ +
+<div class="article-list" data-list-id="main-list" data-category="tech">
+    <article class="post" data-post-id="1" data-author="author-1">
+        <h2 class="title"><a href="/posts/1">Sample Post One</a></h2>
+        <div class="meta"><a class="author" href="/authors/author-1">Author One</a></div>
+        <span class="date">2024-01-01</span>
+        <a class="read-more" href="/posts/1">Read more</a>
+    </article>
+    <article class="post" data-post-id="2" data-author="author-2">
+        <h2 class="title"><a href="/posts/2">Sample Post Two</a></h2>
+        <div class="meta"><a class="author" href="/authors/author-2">Author Two</a></div>
+        <span class="date">2024-01-02</span>
+        <a class="read-more" href="/posts/2">Read more</a>
+    </article>
+</div>
    +""" + +async def demo_ssl_features(): + """ + Enhanced SSL & Security Features Demo + ----------------------------------- + + This example demonstrates the new SSL certificate handling and security features: + 1. Custom certificate paths + 2. SSL verification options + 3. HTTPS error handling + 4. Certificate validation configurations + + These features are particularly useful when: + - Working with self-signed certificates + - Dealing with corporate proxies + - Handling mixed content websites + - Managing different SSL security levels + """ + print("\n1. Enhanced SSL & Security Demo") + print("--------------------------------") + + browser_config = BrowserConfig() + + run_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + fetch_ssl_certificate=True # Enable SSL certificate fetching + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://example.com", + config=run_config + ) + print(f"SSL Crawl Success: {result.success}") + result.ssl_certificate.to_json( + os.path.join(os.getcwd(), "ssl_certificate.json") + ) + if not result.success: + print(f"SSL Error: {result.error_message}") + +async def demo_content_filtering(): + """ + Smart Content Filtering Demo + ---------------------- + + Demonstrates advanced content filtering capabilities: + 1. Custom filter to identify and extract specific content + 2. Integration with markdown generation + 3. Flexible pruning rules + """ + print("\n2. Smart Content Filtering Demo") + print("--------------------------------") + + # Create a custom content filter + class CustomNewsFilter(RelevantContentFilter): + def __init__(self): + super().__init__() + # Add news-specific patterns + self.negative_patterns = re.compile( + r'nav|footer|header|sidebar|ads|comment|share|related|recommended|popular|trending', + re.I + ) + self.min_word_count = 30 # Higher threshold for news content + + def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]: + """ + Implements news-specific content filtering logic. 
+ + Args: + html (str): HTML content to be filtered + min_word_threshold (int, optional): Minimum word count threshold + + Returns: + List[str]: List of filtered HTML content blocks + """ + if not html or not isinstance(html, str): + return [] + + soup = BeautifulSoup(html, 'lxml') + if not soup.body: + soup = BeautifulSoup(f'{html}', 'lxml') + + body = soup.find('body') + + # Extract chunks with metadata + chunks = self.extract_text_chunks(body, min_word_threshold or self.min_word_count) + + # Filter chunks based on news-specific criteria + filtered_chunks = [] + for _, text, tag_type, element in chunks: + # Skip if element has negative class/id + if self.is_excluded(element): + continue + + # Headers are important in news articles + if tag_type == 'header': + filtered_chunks.append(self.clean_element(element)) + continue + + # For content, check word count and link density + text = element.get_text(strip=True) + if len(text.split()) >= (min_word_threshold or self.min_word_count): + # Calculate link density + links_text = ' '.join(a.get_text(strip=True) for a in element.find_all('a')) + link_density = len(links_text) / len(text) if text else 1 + + # Accept if link density is reasonable + if link_density < 0.5: + filtered_chunks.append(self.clean_element(element)) + + return filtered_chunks + + # Create markdown generator with custom filter + markdown_gen = DefaultMarkdownGenerator( + content_filter=CustomNewsFilter() + ) + + run_config = CrawlerRunConfig( + markdown_generator=markdown_gen, + cache_mode=CacheMode.BYPASS + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://news.ycombinator.com", + config=run_config + ) + print("Filtered Content Sample:") + print(result.markdown[:500]) # Show first 500 chars + +async def demo_json_extraction(): + """ + Improved JSON Extraction Demo + --------------------------- + + Demonstrates the enhanced JSON extraction capabilities: + 1. Base element attributes extraction + 2. Complex nested structures + 3. Multiple extraction patterns + + Key features shown: + - Extracting attributes from base elements (href, data-* attributes) + - Processing repeated patterns + - Handling optional fields + """ + print("\n3. 
Improved JSON Extraction Demo") + print("--------------------------------") + + # Define the extraction schema with base element attributes + json_strategy = JsonCssExtractionStrategy( + schema={ + "name": "Blog Posts", + "baseSelector": "div.article-list", + "baseFields": [ + {"name": "list_id", "type": "attribute", "attribute": "data-list-id"}, + {"name": "category", "type": "attribute", "attribute": "data-category"} + ], + "fields": [ + { + "name": "posts", + "selector": "article.post", + "type": "nested_list", + "baseFields": [ + {"name": "post_id", "type": "attribute", "attribute": "data-post-id"}, + {"name": "author_id", "type": "attribute", "attribute": "data-author"} + ], + "fields": [ + { + "name": "title", + "selector": "h2.title a", + "type": "text", + "baseFields": [ + {"name": "url", "type": "attribute", "attribute": "href"} + ] + }, + { + "name": "author", + "selector": "div.meta a.author", + "type": "text", + "baseFields": [ + {"name": "profile_url", "type": "attribute", "attribute": "href"} + ] + }, + { + "name": "date", + "selector": "span.date", + "type": "text" + }, + { + "name": "read_more", + "selector": "a.read-more", + "type": "nested", + "fields": [ + {"name": "text", "type": "text"}, + {"name": "url", "type": "attribute", "attribute": "href"} + ] + } + ] + } + ] + } + ) + + # Demonstrate extraction from raw HTML + run_config = CrawlerRunConfig( + extraction_strategy=json_strategy, + cache_mode=CacheMode.BYPASS + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="raw:" + SAMPLE_HTML, # Use raw: prefix for raw HTML + config=run_config + ) + print("Extracted Content:") + print(result.extracted_content) + +async def demo_input_formats(): + """ + Input Format Handling Demo + ---------------------- + + Demonstrates how LLM extraction can work with different input formats: + 1. Markdown (default) - Good for simple text extraction + 2. HTML - Better when you need structure and attributes + + This example shows how HTML input can be beneficial when: + - You need to understand the DOM structure + - You want to extract both visible text and HTML attributes + - The content has complex layouts like tables or forms + """ + print("\n4. Input Format Handling Demo") + print("---------------------------") + + # Create a dummy HTML with rich structure + dummy_html = """ +
+    <div class="job-posting">
+        <header class="job-header">
+            <h1>Senior AI/ML Engineer</h1>
+            <div class="job-meta">
+                <span class="department">AI Research Division</span>
+                <span class="location">San Francisco (Hybrid)</span>
+            </div>
+            <div class="salary">
+                <span class="amount">$150,000 - $220,000</span>
+                <span class="period">per year</span>
+            </div>
+        </header>
+        <section class="requirements">
+            <h3>Technical Requirements</h3>
+            <ul class="required">
+                <li>5+ years experience in Machine Learning</li>
+                <li>Proficiency in Python and PyTorch/TensorFlow</li>
+                <li>Experience with distributed training systems</li>
+            </ul>
+            <h3>Professional Skills</h3>
+            <ul class="preferred">
+                <li>Strong problem-solving abilities</li>
+                <li>Experience leading technical teams</li>
+            </ul>
+        </section>
+        <footer class="contact-section">
+            <h3>Hiring Manager</h3>
+            <p class="name">Dr. Sarah Chen</p>
+            <p class="role">Director of AI Research</p>
+        </footer>
+        <p class="team-note">Join our team of 50+ researchers working on cutting-edge AI applications</p>
+    </div>
    + """ + + # Use raw:// prefix to pass HTML content directly + url = f"raw://{dummy_html}" + + from pydantic import BaseModel, Field + from typing import List, Optional + + # Define our schema using Pydantic + class JobRequirement(BaseModel): + category: str = Field(description="Category of the requirement (e.g., Technical, Soft Skills)") + items: List[str] = Field(description="List of specific requirements in this category") + priority: str = Field(description="Priority level (Required/Preferred) based on the HTML class or context") + + class JobPosting(BaseModel): + title: str = Field(description="Job title") + department: str = Field(description="Department or team") + location: str = Field(description="Job location, including remote options") + salary_range: Optional[str] = Field(description="Salary range if specified") + requirements: List[JobRequirement] = Field(description="Categorized job requirements") + application_deadline: Optional[str] = Field(description="Application deadline if specified") + contact_info: Optional[dict] = Field(description="Contact information from footer or contact section") + + # First try with markdown (default) + markdown_strategy = LLMExtractionStrategy( + provider="openai/gpt-4o", + api_token=os.getenv("OPENAI_API_KEY"), + schema=JobPosting.model_json_schema(), + extraction_type="schema", + instruction=""" + Extract job posting details into structured data. Focus on the visible text content + and organize requirements into categories. + """, + input_format="markdown" # default + ) + + # Then with HTML for better structure understanding + html_strategy = LLMExtractionStrategy( + provider="openai/gpt-4", + api_token=os.getenv("OPENAI_API_KEY"), + schema=JobPosting.model_json_schema(), + extraction_type="schema", + instruction=""" + Extract job posting details, using HTML structure to: + 1. Identify requirement priorities from CSS classes (e.g., 'required' vs 'preferred') + 2. Extract contact info from the page footer or dedicated contact section + 3. Parse salary information from specially formatted elements + 4. Determine application deadline from timestamp or date elements + + Use HTML attributes and classes to enhance extraction accuracy. 
+ """, + input_format="html" # explicitly use HTML + ) + + async with AsyncWebCrawler() as crawler: + # Try with markdown first + markdown_config = CrawlerRunConfig( + extraction_strategy=markdown_strategy + ) + markdown_result = await crawler.arun( + url=url, + config=markdown_config + ) + print("\nMarkdown-based Extraction Result:") + items = json.loads(markdown_result.extracted_content) + print(json.dumps(items, indent=2)) + + # Then with HTML for better structure understanding + html_config = CrawlerRunConfig( + extraction_strategy=html_strategy + ) + html_result = await crawler.arun( + url=url, + config=html_config + ) + print("\nHTML-based Extraction Result:") + items = json.loads(html_result.extracted_content) + print(json.dumps(items, indent=2)) + +# Main execution +async def main(): + print("Crawl4AI v0.4.24 Feature Walkthrough") + print("====================================") + + # Run all demos + await demo_ssl_features() + await demo_content_filtering() + await demo_json_extraction() + # await demo_input_formats() + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/docs/md_v2/advanced/content-processing.md b/docs/md_v2/advanced/content-processing.md new file mode 100644 index 0000000000000000000000000000000000000000..25ed6172f2451dcce4898c1eeb0cb923ed07defd --- /dev/null +++ b/docs/md_v2/advanced/content-processing.md @@ -0,0 +1,136 @@ +# Content Processing + +Crawl4AI provides powerful content processing capabilities that help you extract clean, relevant content from web pages. This guide covers content cleaning, media handling, link analysis, and metadata extraction. + +## Media Processing + +Crawl4AI provides comprehensive media extraction and analysis capabilities. It automatically detects and processes various types of media elements while maintaining their context and relevance. + +### Image Processing + +The library handles various image scenarios, including: +- Regular images +- Lazy-loaded images +- Background images +- Responsive images +- Image metadata and context + +```python +from crawl4ai.async_configs import CrawlerRunConfig + +config = CrawlerRunConfig() +result = await crawler.arun(url="https://example.com", config=config) + +for image in result.media["images"]: + # Each image includes rich metadata + print(f"Source: {image['src']}") + print(f"Alt text: {image['alt']}") + print(f"Description: {image['desc']}") + print(f"Context: {image['context']}") # Surrounding text + print(f"Relevance score: {image['score']}") # 0-10 score +``` + +### Handling Lazy-Loaded Content + +Crawl4AI already handles lazy loading for media elements. 
You can customize the wait time for lazy-loaded content with `CrawlerRunConfig`: + +```python +config = CrawlerRunConfig( + wait_for="css:img[data-src]", # Wait for lazy images + delay_before_return_html=2.0 # Additional wait time +) +result = await crawler.arun(url="https://example.com", config=config) +``` + +### Video and Audio Content + +The library extracts video and audio elements with their metadata: + +```python +from crawl4ai.async_configs import CrawlerRunConfig + +config = CrawlerRunConfig() +result = await crawler.arun(url="https://example.com", config=config) + +# Process videos +for video in result.media["videos"]: + print(f"Video source: {video['src']}") + print(f"Type: {video['type']}") + print(f"Duration: {video.get('duration')}") + print(f"Thumbnail: {video.get('poster')}") + +# Process audio +for audio in result.media["audios"]: + print(f"Audio source: {audio['src']}") + print(f"Type: {audio['type']}") + print(f"Duration: {audio.get('duration')}") +``` + +## Link Analysis + +Crawl4AI provides sophisticated link analysis capabilities, helping you understand the relationship between pages and identify important navigation patterns. + +### Link Classification + +The library automatically categorizes links into: +- Internal links (same domain) +- External links (different domains) +- Social media links +- Navigation links +- Content links + +```python +from crawl4ai.async_configs import CrawlerRunConfig + +config = CrawlerRunConfig() +result = await crawler.arun(url="https://example.com", config=config) + +# Analyze internal links +for link in result.links["internal"]: + print(f"Internal: {link['href']}") + print(f"Link text: {link['text']}") + print(f"Context: {link['context']}") # Surrounding text + print(f"Type: {link['type']}") # nav, content, etc. 
+ +# Analyze external links +for link in result.links["external"]: + print(f"External: {link['href']}") + print(f"Domain: {link['domain']}") + print(f"Type: {link['type']}") +``` + +### Smart Link Filtering + +Control which links are included in the results with `CrawlerRunConfig`: + +```python +config = CrawlerRunConfig( + exclude_external_links=True, # Remove external links + exclude_social_media_links=True, # Remove social media links + exclude_social_media_domains=[ # Custom social media domains + "facebook.com", "twitter.com", "instagram.com" + ], + exclude_domains=["ads.example.com"] # Exclude specific domains +) +result = await crawler.arun(url="https://example.com", config=config) +``` + +## Metadata Extraction + +Crawl4AI automatically extracts and processes page metadata, providing valuable information about the content: + +```python +from crawl4ai.async_configs import CrawlerRunConfig + +config = CrawlerRunConfig() +result = await crawler.arun(url="https://example.com", config=config) + +metadata = result.metadata +print(f"Title: {metadata['title']}") +print(f"Description: {metadata['description']}") +print(f"Keywords: {metadata['keywords']}") +print(f"Author: {metadata['author']}") +print(f"Published Date: {metadata['published_date']}") +print(f"Modified Date: {metadata['modified_date']}") +print(f"Language: {metadata['language']}") +``` diff --git a/docs/md_v2/advanced/hooks-auth.md b/docs/md_v2/advanced/hooks-auth.md new file mode 100644 index 0000000000000000000000000000000000000000..6604222910df5d77cfbf99deaca8c54cd299e9d3 --- /dev/null +++ b/docs/md_v2/advanced/hooks-auth.md @@ -0,0 +1,121 @@ +# Hooks & Auth for AsyncWebCrawler + +Crawl4AI's `AsyncWebCrawler` allows you to customize the behavior of the web crawler using hooks. Hooks are asynchronous functions called at specific points in the crawling process, allowing you to modify the crawler's behavior or perform additional actions. This updated documentation demonstrates how to use hooks, including the new `on_page_context_created` hook, and ensures compatibility with `BrowserConfig` and `CrawlerRunConfig`. + +## Example: Using Crawler Hooks with AsyncWebCrawler + +In this example, we'll: + +1. Configure the browser and set up authentication when it's created. +2. Apply custom routing and initial actions when the page context is created. +3. Add custom headers before navigating to the URL. +4. Log the current URL after navigation. +5. Perform actions after JavaScript execution. +6. Log the length of the HTML before returning it. 
+ +### Hook Definitions + +```python +import asyncio +from crawl4ai import AsyncWebCrawler +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig +from playwright.async_api import Page, Browser, BrowserContext + +def log_routing(route): + # Example: block loading images + if route.request.resource_type == "image": + print(f"[HOOK] Blocking image request: {route.request.url}") + asyncio.create_task(route.abort()) + else: + asyncio.create_task(route.continue_()) + +async def on_browser_created(browser: Browser, **kwargs): + print("[HOOK] on_browser_created") + # Example: Set browser viewport size and log in + context = await browser.new_context(viewport={"width": 1920, "height": 1080}) + page = await context.new_page() + await page.goto("https://example.com/login") + await page.fill("input[name='username']", "testuser") + await page.fill("input[name='password']", "password123") + await page.click("button[type='submit']") + await page.wait_for_selector("#welcome") + await context.add_cookies([{"name": "auth_token", "value": "abc123", "url": "https://example.com"}]) + await page.close() + await context.close() + +async def on_page_context_created(context: BrowserContext, page: Page, **kwargs): + print("[HOOK] on_page_context_created") + await context.route("**", log_routing) + +async def before_goto(page: Page, context: BrowserContext, **kwargs): + print("[HOOK] before_goto") + await page.set_extra_http_headers({"X-Test-Header": "test"}) + +async def after_goto(page: Page, context: BrowserContext, **kwargs): + print("[HOOK] after_goto") + print(f"Current URL: {page.url}") + +async def on_execution_started(page: Page, context: BrowserContext, **kwargs): + print("[HOOK] on_execution_started") + await page.evaluate("console.log('Custom JS executed')") + +async def before_return_html(page: Page, context: BrowserContext, html: str, **kwargs): + print("[HOOK] before_return_html") + print(f"HTML length: {len(html)}") + return page +``` + +### Using the Hooks with AsyncWebCrawler + +```python +async def main(): + print("\n🔗 Using Crawler Hooks: Customize AsyncWebCrawler with hooks!") + + # Configure browser and crawler settings + browser_config = BrowserConfig( + headless=True, + viewport_width=1920, + viewport_height=1080 + ) + + crawler_run_config = CrawlerRunConfig( + js_code="window.scrollTo(0, document.body.scrollHeight);", + wait_for="footer" + ) + + # Initialize crawler + async with AsyncWebCrawler(config=browser_config) as crawler: + crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created) + crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created) + crawler.crawler_strategy.set_hook("before_goto", before_goto) + crawler.crawler_strategy.set_hook("after_goto", after_goto) + crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started) + crawler.crawler_strategy.set_hook("before_return_html", before_return_html) + + # Run the crawler + result = await crawler.arun(url="https://example.com", config=crawler_run_config) + + print("\n📦 Crawler Hooks Result:") + print(result) + +asyncio.run(main()) +``` + +### Explanation of Hooks + +- **`on_browser_created`**: Called when the browser is created. Use this to configure the browser or handle authentication (e.g., logging in and setting cookies). +- **`on_page_context_created`**: Called when a new page context is created. Use this to apply routing, block resources, or inject custom logic before navigating to the URL. +- **`before_goto`**: Called before navigating to the URL. 
Use this to add custom headers or perform other pre-navigation actions. +- **`after_goto`**: Called after navigation. Use this to verify content or log the URL. +- **`on_execution_started`**: Called after executing custom JavaScript. Use this to perform additional actions. +- **`before_return_html`**: Called before returning the HTML content. Use this to log details or preprocess the content. + +### Additional Customizations + +- **Resource Management**: Use `on_page_context_created` to block or modify requests (e.g., block images, fonts, or third-party scripts). +- **Dynamic Headers**: Use `before_goto` to add or modify headers dynamically based on the URL. +- **Authentication**: Use `on_browser_created` to handle login processes and set authentication cookies or tokens. +- **Content Analysis**: Use `before_return_html` to analyze or modify the extracted HTML content. + +These hooks provide powerful customization options for tailoring the crawling process to your needs. + diff --git a/docs/md_v2/advanced/identity_based_crawling.md b/docs/md_v2/advanced/identity_based_crawling.md new file mode 100644 index 0000000000000000000000000000000000000000..c0ab7fd599d13c62c9c4c4a64ca6d5499966dde4 --- /dev/null +++ b/docs/md_v2/advanced/identity_based_crawling.md @@ -0,0 +1,156 @@ +### Preserve Your Identity with Crawl4AI + +Crawl4AI empowers you to navigate and interact with the web using your authentic digital identity, ensuring that you are recognized as a human and not mistaken for a bot. This document introduces Managed Browsers, the recommended approach for preserving your rights to access the web, and Magic Mode, a simplified solution for specific scenarios. + +--- + +### Managed Browsers: Your Digital Identity Solution + +**Managed Browsers** enable developers to create and use persistent browser profiles. These profiles store local storage, cookies, and other session-related data, allowing you to interact with websites as a recognized user. By leveraging your unique identity, Managed Browsers ensure that your experience reflects your rights as a human browsing the web. + +#### Why Use Managed Browsers? +1. **Authentic Browsing Experience**: Managed Browsers retain session data and browser fingerprints, mirroring genuine user behavior. +2. **Effortless Configuration**: Once you interact with the site using the browser (e.g., solving a CAPTCHA), the session data is saved and reused, providing seamless access. +3. **Empowered Data Access**: By using your identity, Managed Browsers empower users to access data they can view on their own screens without artificial restrictions. + +#### Steps to Use Managed Browsers + +1. **Setup the Browser Configuration**: + ```python + from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + from crawl4ai.extraction_strategy import JsonCssExtractionStrategy + + browser_config = BrowserConfig( + headless=False, # Set to False for initial setup to view browser actions + verbose=True, + user_agent_mode="random", + use_managed_browser=True, # Enables persistent browser sessions + browser_type="chromium", + user_data_dir="/path/to/user_profile_data" # Path to save session data + ) + ``` + +2. **Perform an Initial Run**: + - Run the crawler with `headless=False`. + - Manually interact with the site (e.g., solve CAPTCHA or log in). + - The browser session saves cookies, local storage, and other required data. + +3. **Subsequent Runs**: + - Switch to `headless=True` for automation. + - The session data is reused, allowing seamless crawling. 
+ +#### Example: Extracting Data Using Managed Browsers + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy + +async def main(): + # Define schema for structured data extraction + schema = { + "name": "Example Data", + "baseSelector": "div.example", + "fields": [ + {"name": "title", "selector": "h1", "type": "text"}, + {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"} + ] + } + + # Configure crawler + browser_config = BrowserConfig( + headless=True, # Automate subsequent runs + verbose=True, + use_managed_browser=True, + user_data_dir="/path/to/user_profile_data" + ) + + crawl_config = CrawlerRunConfig( + extraction_strategy=JsonCssExtractionStrategy(schema), + wait_for="css:div.example" # Wait for the targeted element to load + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://example.com", + config=crawl_config + ) + + if result.success: + print("Extracted Data:", result.extracted_content) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +### Benefits of Managed Browsers Over Other Methods +Managed Browsers eliminate the need for manual detection workarounds by enabling developers to work directly with their identity and user profile data. This approach ensures maximum compatibility with websites and simplifies the crawling process while preserving your right to access data freely. + +--- + +### Magic Mode: Simplified Automation + +While Managed Browsers are the preferred approach, **Magic Mode** provides an alternative for scenarios where persistent user profiles are unnecessary or infeasible. Magic Mode automates user-like behavior and simplifies configuration. + +#### What Magic Mode Does: +- Simulates human browsing by randomizing interaction patterns and timing. +- Masks browser automation signals. +- Handles cookie popups and modals. +- Modifies navigator properties for enhanced compatibility. + +#### Using Magic Mode + +```python +async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com", + magic=True # Enables all automation features + ) +``` + +Magic Mode is particularly useful for: +- Quick prototyping when a Managed Browser setup is not available. +- Basic sites requiring minimal interaction or configuration. + +#### Example: Combining Magic Mode with Additional Options + +```python +async def crawl_with_magic_mode(url: str): + async with AsyncWebCrawler(headless=True) as crawler: + result = await crawler.arun( + url=url, + magic=True, + remove_overlay_elements=True, # Remove popups/modals + page_timeout=60000 # Increased timeout for complex pages + ) + + return result.markdown if result.success else None +``` + +### Magic Mode vs. Managed Browsers +While Magic Mode simplifies many tasks, it cannot match the reliability and authenticity of Managed Browsers. By using your identity and persistent profiles, Managed Browsers render Magic Mode largely unnecessary. However, Magic Mode remains a viable fallback for specific situations where user identity is not a factor. + +--- + +### Key Comparison: Managed Browsers vs. Magic Mode + +| Feature | **Managed Browsers** | **Magic Mode** | +|-------------------------|------------------------------------------|-------------------------------------| +| **Session Persistence** | Retains cookies and local storage. | No session retention. 
| +| **Human Interaction** | Uses real user profiles and data. | Simulates human-like patterns. | +| **Complex Sites** | Best suited for heavily configured sites.| Works well with simpler challenges.| +| **Setup Complexity** | Requires initial manual interaction. | Fully automated, one-line setup. | + +#### Recommendation: +- Use **Managed Browsers** for reliable, session-based crawling and data extraction. +- Use **Magic Mode** for quick prototyping or when persistent profiles are not required. + +--- + +### Conclusion + +- **Use Managed Browsers** to preserve your digital identity and ensure reliable, identity-based crawling with persistent sessions. This approach works seamlessly for even the most complex websites. +- **Leverage Magic Mode** for quick automation or in scenarios where persistent user profiles are not needed. + +By combining these approaches, Crawl4AI provides unparalleled flexibility and capability for your crawling needs. + diff --git a/docs/md_v2/advanced/magic-mode.md b/docs/md_v2/advanced/magic-mode.md new file mode 100644 index 0000000000000000000000000000000000000000..16c7229e787deb263af917067fefbc9d89142f5c --- /dev/null +++ b/docs/md_v2/advanced/magic-mode.md @@ -0,0 +1,52 @@ +# Magic Mode & Anti-Bot Protection + +Crawl4AI provides powerful anti-detection capabilities, with Magic Mode being the simplest and most comprehensive solution. + +## Magic Mode + +The easiest way to bypass anti-bot protections: + +```python +async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com", + magic=True # Enables all anti-detection features + ) +``` + +Magic Mode automatically: +- Masks browser automation signals +- Simulates human-like behavior +- Overrides navigator properties +- Handles cookie consent popups +- Manages browser fingerprinting +- Randomizes timing patterns + +## Manual Anti-Bot Options + +While Magic Mode is recommended, you can also configure individual anti-detection features: + +```python +result = await crawler.arun( + url="https://example.com", + simulate_user=True, # Simulate human behavior + override_navigator=True # Mask automation signals +) +``` + +Note: When `magic=True` is used, you don't need to set these individual options. + +## Example: Handling Protected Sites + +```python +async def crawl_protected_site(url: str): + async with AsyncWebCrawler(headless=True) as crawler: + result = await crawler.arun( + url=url, + magic=True, + remove_overlay_elements=True, # Remove popups/modals + page_timeout=60000 # Increased timeout for protection checks + ) + + return result.markdown if result.success else None +``` diff --git a/docs/md_v2/advanced/managed_browser.md b/docs/md_v2/advanced/managed_browser.md new file mode 100644 index 0000000000000000000000000000000000000000..bbe07f2f8deff46e1722a3b002ded08b046a9a6c --- /dev/null +++ b/docs/md_v2/advanced/managed_browser.md @@ -0,0 +1,188 @@ +# Creating Browser Instances, Contexts, and Pages + +## 1 Introduction + +### Overview of Browser Management in Crawl4AI +Crawl4AI's browser management system is designed to provide developers with advanced tools for handling complex web crawling tasks. By managing browser instances, contexts, and pages, Crawl4AI ensures optimal performance, anti-bot measures, and session persistence for high-volume, dynamic web crawling. + +### Key Objectives +- **Anti-Bot Handling**: + - Implements stealth techniques to evade detection mechanisms used by modern websites. 
+ - Simulates human-like behavior, such as mouse movements, scrolling, and key presses. + - Supports integration with third-party services to bypass CAPTCHA challenges. +- **Persistent Sessions**: + - Retains session data (cookies, local storage) for workflows requiring user authentication. + - Allows seamless continuation of tasks across multiple runs without re-authentication. +- **Scalable Crawling**: + - Optimized resource utilization for handling thousands of URLs concurrently. + - Flexible configuration options to tailor crawling behavior to specific requirements. + +--- + +## 2 Browser Creation Methods + +### Standard Browser Creation +Standard browser creation initializes a browser instance with default or minimal configurations. It is suitable for tasks that do not require session persistence or heavy customization. + +#### Features and Limitations +- **Features**: + - Quick and straightforward setup for small-scale tasks. + - Supports headless and headful modes. +- **Limitations**: + - Lacks advanced customization options like session reuse. + - May struggle with sites employing strict anti-bot measures. + +#### Example Usage +```python +from crawl4ai import AsyncWebCrawler, BrowserConfig + +browser_config = BrowserConfig(browser_type="chromium", headless=True) +async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun("https://crawl4ai.com") + print(result.markdown) +``` + +### Persistent Contexts +Persistent contexts create browser sessions with stored data, enabling workflows that require maintaining login states or other session-specific information. + +#### Benefits of Using `user_data_dir` +- **Session Persistence**: + - Stores cookies, local storage, and cache between crawling sessions. + - Reduces overhead for repetitive logins or multi-step workflows. +- **Enhanced Performance**: + - Leverages pre-loaded resources for faster page loading. +- **Flexibility**: + - Adapts to complex workflows requiring user-specific configurations. + +#### Example: Setting Up Persistent Contexts +```python +config = BrowserConfig(user_data_dir="/path/to/user/data") +async with AsyncWebCrawler(config=config) as crawler: + result = await crawler.arun("https://crawl4ai.com") + print(result.markdown) +``` + +### Managed Browser +The `ManagedBrowser` class offers a high-level abstraction for managing browser instances, emphasizing resource management, debugging capabilities, and anti-bot measures. + +#### How It Works +- **Browser Process Management**: + - Automates initialization and cleanup of browser processes. + - Optimizes resource usage by pooling and reusing browser instances. +- **Debugging Support**: + - Integrates with debugging tools like Chrome Developer Tools for real-time inspection. +- **Anti-Bot Measures**: + - Implements stealth plugins to mimic real user behavior and bypass bot detection. + +#### Features +- **Customizable Configurations**: + - Supports advanced options such as viewport resizing, proxy settings, and header manipulation. +- **Debugging and Logging**: + - Logs detailed browser interactions for debugging and performance analysis. +- **Scalability**: + - Handles multiple browser instances concurrently, scaling dynamically based on workload. 
+ +#### Example: Using `ManagedBrowser` +```python +from crawl4ai import AsyncWebCrawler, BrowserConfig + +config = BrowserConfig(headless=False, debug_port=9222) +async with AsyncWebCrawler(config=config) as crawler: + result = await crawler.arun("https://crawl4ai.com") + print(result.markdown) +``` + +--- + +## 3 Context and Page Management + +### Creating and Configuring Browser Contexts +Browser contexts act as isolated environments within a single browser instance, enabling independent browsing sessions with their own cookies, cache, and storage. + +#### Customizations +- **Headers and Cookies**: + - Define custom headers to mimic specific devices or browsers. + - Set cookies for authenticated sessions. +- **Session Reuse**: + - Retain and reuse session data across multiple requests. + - Example: Preserve login states for authenticated crawls. + +#### Example: Context Initialization +```python +from crawl4ai import CrawlerRunConfig + +config = CrawlerRunConfig(headers={"User-Agent": "Crawl4AI/1.0"}) +async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://crawl4ai.com", config=config) + print(result.markdown) +``` + +### Creating Pages +Pages represent individual tabs or views within a browser context. They are responsible for rendering content, executing JavaScript, and handling user interactions. + +#### Key Features +- **IFrame Handling**: + - Extract content from embedded iframes. + - Navigate and interact with nested content. +- **Viewport Customization**: + - Adjust viewport size to match target device dimensions. +- **Lazy Loading**: + - Ensure dynamic elements are fully loaded before extraction. + +#### Example: Page Initialization +```python +config = CrawlerRunConfig(viewport_width=1920, viewport_height=1080) +async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://crawl4ai.com", config=config) + print(result.markdown) +``` + +--- + +## 4 Advanced Features and Best Practices + +### Debugging and Logging +Remote debugging provides a powerful way to troubleshoot complex crawling workflows. + +#### Example: Enabling Remote Debugging +```python +config = BrowserConfig(debug_port=9222) +async with AsyncWebCrawler(config=config) as crawler: + result = await crawler.arun("https://crawl4ai.com") +``` + +### Anti-Bot Techniques +- **Human Behavior Simulation**: + - Mimic real user actions, such as scrolling, clicking, and typing. + - Example: Use JavaScript to simulate interactions. +- **Captcha Handling**: + - Integrate with third-party services like 2Captcha or AntiCaptcha for automated solving. + +#### Example: Simulating User Actions +```python +js_code = """ +(async () => { + document.querySelector('input[name="search"]').value = 'test'; + document.querySelector('button[type="submit"]').click(); +})(); +""" +config = CrawlerRunConfig(js_code=[js_code]) +async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://crawl4ai.com", config=config) +``` + +### Optimizations for Performance and Scalability +- **Persistent Contexts**: + - Reuse browser contexts to minimize resource consumption. +- **Concurrent Crawls**: + - Use `arun_many` with a controlled semaphore count for efficient batch processing. 
+ +#### Example: Scaling Crawls +```python +urls = ["https://example1.com", "https://example2.com"] +config = CrawlerRunConfig(semaphore_count=10) +async with AsyncWebCrawler() as crawler: + results = await crawler.arun_many(urls, config=config) + for result in results: + print(result.url, result.markdown) +``` diff --git a/docs/md_v2/advanced/proxy-security.md b/docs/md_v2/advanced/proxy-security.md new file mode 100644 index 0000000000000000000000000000000000000000..8989777b0b07e40a8c3c744dd53dd89919c57b6b --- /dev/null +++ b/docs/md_v2/advanced/proxy-security.md @@ -0,0 +1,95 @@ +# Proxy & Security + +Configure proxy settings and enhance security features in Crawl4AI for reliable data extraction. + +## Basic Proxy Setup + +Simple proxy configuration with `BrowserConfig`: + +```python +from crawl4ai.async_configs import BrowserConfig + +# Using proxy URL +browser_config = BrowserConfig(proxy="http://proxy.example.com:8080") +async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url="https://example.com") + +# Using SOCKS proxy +browser_config = BrowserConfig(proxy="socks5://proxy.example.com:1080") +async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url="https://example.com") +``` + +## Authenticated Proxy + +Use an authenticated proxy with `BrowserConfig`: + +```python +from crawl4ai.async_configs import BrowserConfig + +proxy_config = { + "server": "http://proxy.example.com:8080", + "username": "user", + "password": "pass" +} + +browser_config = BrowserConfig(proxy_config=proxy_config) +async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url="https://example.com") +``` + +## Rotating Proxies + +Example using a proxy rotation service and updating `BrowserConfig` dynamically: + +```python +from crawl4ai.async_configs import BrowserConfig + +async def get_next_proxy(): + # Your proxy rotation logic here + return {"server": "http://next.proxy.com:8080"} + +browser_config = BrowserConfig() +async with AsyncWebCrawler(config=browser_config) as crawler: + # Update proxy for each request + for url in urls: + proxy = await get_next_proxy() + browser_config.proxy_config = proxy + result = await crawler.arun(url=url, config=browser_config) +``` + +## Custom Headers + +Add security-related headers via `BrowserConfig`: + +```python +from crawl4ai.async_configs import BrowserConfig + +headers = { + "X-Forwarded-For": "203.0.113.195", + "Accept-Language": "en-US,en;q=0.9", + "Cache-Control": "no-cache", + "Pragma": "no-cache" +} + +browser_config = BrowserConfig(headers=headers) +async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url="https://example.com") +``` + +## Combining with Magic Mode + +For maximum protection, combine proxy with Magic Mode via `CrawlerRunConfig` and `BrowserConfig`: + +```python +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig + +browser_config = BrowserConfig( + proxy="http://proxy.example.com:8080", + headers={"Accept-Language": "en-US"} +) +crawler_config = CrawlerRunConfig(magic=True) # Enable all anti-detection features + +async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url="https://example.com", config=crawler_config) +``` diff --git a/docs/md_v2/advanced/session-management-advanced.md b/docs/md_v2/advanced/session-management-advanced.md new file mode 100644 index 0000000000000000000000000000000000000000..ba1ae0a0bfc0ef52f82eb6e915496b31c4b3b1b5 --- 
/dev/null +++ b/docs/md_v2/advanced/session-management-advanced.md @@ -0,0 +1,179 @@ +### Session-Based Crawling for Dynamic Content + +In modern web applications, content is often loaded dynamically without changing the URL. Examples include "Load More" buttons, infinite scrolling, or paginated content that updates via JavaScript. Crawl4AI provides session-based crawling capabilities to handle such scenarios effectively. + +This guide explores advanced techniques for crawling dynamic content using Crawl4AI's session management features. + +--- + +## Understanding Session-Based Crawling + +Session-based crawling allows you to reuse a persistent browser session across multiple actions. This means the same browser tab (or page object) is used throughout, enabling: + +1. **Efficient handling of dynamic content** without reloading the page. +2. **JavaScript actions before and after crawling** (e.g., clicking buttons or scrolling). +3. **State maintenance** for authenticated sessions or multi-step workflows. +4. **Faster sequential crawling**, as it avoids reopening tabs or reallocating resources. + +**Note:** Session-based crawling is ideal for sequential operations, not parallel tasks. + +--- + +## Basic Concepts + +Before diving into examples, here are some key concepts: + +- **Session ID**: A unique identifier for a browsing session. Use the same `session_id` across multiple requests to maintain state. +- **BrowserConfig & CrawlerRunConfig**: These configuration objects control browser settings and crawling behavior. +- **JavaScript Execution**: Use `js_code` to perform actions like clicking buttons. +- **CSS Selectors**: Target specific elements for interaction or data extraction. +- **Extraction Strategy**: Define rules to extract structured data. +- **Wait Conditions**: Specify conditions to wait for before proceeding. + +--- + +## Example 1: Basic Session-Based Crawling + +A simple example using session-based crawling: + +```python +import asyncio +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig +from crawl4ai.cache_context import CacheMode + +async def basic_session_crawl(): + async with AsyncWebCrawler() as crawler: + session_id = "dynamic_content_session" + url = "https://example.com/dynamic-content" + + for page in range(3): + config = CrawlerRunConfig( + url=url, + session_id=session_id, + js_code="document.querySelector('.load-more-button').click();" if page > 0 else None, + css_selector=".content-item", + cache_mode=CacheMode.BYPASS + ) + + result = await crawler.arun(config=config) + print(f"Page {page + 1}: Found {result.extracted_content.count('.content-item')} items") + + await crawler.crawler_strategy.kill_session(session_id) + +asyncio.run(basic_session_crawl()) +``` + +This example shows: +1. Reusing the same `session_id` across multiple requests. +2. Executing JavaScript to load more content dynamically. +3. Properly closing the session to free resources. 
+ +--- + +## Advanced Technique 1: Custom Execution Hooks + +Use custom hooks to handle complex scenarios, such as waiting for content to load dynamically: + +```python +async def advanced_session_crawl_with_hooks(): + first_commit = "" + + async def on_execution_started(page): + nonlocal first_commit + try: + while True: + await page.wait_for_selector("li.commit-item h4") + commit = await page.query_selector("li.commit-item h4") + commit = await commit.evaluate("(element) => element.textContent").strip() + if commit and commit != first_commit: + first_commit = commit + break + await asyncio.sleep(0.5) + except Exception as e: + print(f"Warning: New content didn't appear: {e}") + + async with AsyncWebCrawler() as crawler: + session_id = "commit_session" + url = "https://github.com/example/repo/commits/main" + crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started) + + js_next_page = """document.querySelector('a.pagination-next').click();""" + + for page in range(3): + config = CrawlerRunConfig( + url=url, + session_id=session_id, + js_code=js_next_page if page > 0 else None, + css_selector="li.commit-item", + js_only=page > 0, + cache_mode=CacheMode.BYPASS + ) + + result = await crawler.arun(config=config) + print(f"Page {page + 1}: Found {len(result.extracted_content)} commits") + + await crawler.crawler_strategy.kill_session(session_id) + +asyncio.run(advanced_session_crawl_with_hooks()) +``` + +This technique ensures new content loads before the next action. + +--- + +## Advanced Technique 2: Integrated JavaScript Execution and Waiting + +Combine JavaScript execution and waiting logic for concise handling of dynamic content: + +```python +async def integrated_js_and_wait_crawl(): + async with AsyncWebCrawler() as crawler: + session_id = "integrated_session" + url = "https://github.com/example/repo/commits/main" + + js_next_page_and_wait = """ + (async () => { + const getCurrentCommit = () => document.querySelector('li.commit-item h4').textContent.trim(); + const initialCommit = getCurrentCommit(); + document.querySelector('a.pagination-next').click(); + while (getCurrentCommit() === initialCommit) { + await new Promise(resolve => setTimeout(resolve, 100)); + } + })(); + """ + + for page in range(3): + config = CrawlerRunConfig( + url=url, + session_id=session_id, + js_code=js_next_page_and_wait if page > 0 else None, + css_selector="li.commit-item", + js_only=page > 0, + cache_mode=CacheMode.BYPASS + ) + + result = await crawler.arun(config=config) + print(f"Page {page + 1}: Found {len(result.extracted_content)} commits") + + await crawler.crawler_strategy.kill_session(session_id) + +asyncio.run(integrated_js_and_wait_crawl()) +``` + +--- + +## Best Practices for Session-Based Crawling + +1. **Unique Session IDs**: Assign descriptive and unique `session_id` values. +2. **Close Sessions**: Always clean up sessions with `kill_session` after use. +3. **Error Handling**: Anticipate and handle errors gracefully. +4. **Respect Websites**: Follow terms of service and robots.txt. +5. **Delays**: Add delays to avoid overwhelming servers. +6. **Optimize JavaScript**: Keep scripts concise for better performance. +7. **Monitor Resources**: Track memory and CPU usage for long sessions. + +--- + +## Conclusion + +Session-based crawling in Crawl4AI is a robust solution for handling dynamic content and multi-step workflows. 
By combining session management, JavaScript execution, and structured extraction strategies, you can effectively navigate and extract data from modern web applications. Always adhere to ethical web scraping practices and respect website policies. \ No newline at end of file diff --git a/docs/md_v2/advanced/session-management.md b/docs/md_v2/advanced/session-management.md new file mode 100644 index 0000000000000000000000000000000000000000..e93482236f9191c5b1df58adeafc8812095c0afb --- /dev/null +++ b/docs/md_v2/advanced/session-management.md @@ -0,0 +1,137 @@ +### Session Management + +Session management in Crawl4AI is a powerful feature that allows you to maintain state across multiple requests, making it particularly suitable for handling complex multi-step crawling tasks. It enables you to reuse the same browser tab (or page object) across sequential actions and crawls, which is beneficial for: + +- **Performing JavaScript actions before and after crawling.** +- **Executing multiple sequential crawls faster** without needing to reopen tabs or allocate memory repeatedly. + +**Note:** This feature is designed for sequential workflows and is not suitable for parallel operations. + +--- + +#### Basic Session Usage + +Use `BrowserConfig` and `CrawlerRunConfig` to maintain state with a `session_id`: + +```python +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig + +async with AsyncWebCrawler() as crawler: + session_id = "my_session" + + # Define configurations + config1 = CrawlerRunConfig(url="https://example.com/page1", session_id=session_id) + config2 = CrawlerRunConfig(url="https://example.com/page2", session_id=session_id) + + # First request + result1 = await crawler.arun(config=config1) + + # Subsequent request using the same session + result2 = await crawler.arun(config=config2) + + # Clean up when done + await crawler.crawler_strategy.kill_session(session_id) +``` + +--- + +#### Dynamic Content with Sessions + +Here's an example of crawling GitHub commits across multiple pages while preserving session state: + +```python +from crawl4ai.async_configs import CrawlerRunConfig +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai.cache_context import CacheMode + +async def crawl_dynamic_content(): + async with AsyncWebCrawler() as crawler: + session_id = "github_commits_session" + url = "https://github.com/microsoft/TypeScript/commits/main" + all_commits = [] + + # Define extraction schema + schema = { + "name": "Commit Extractor", + "baseSelector": "li.Box-sc-g0xbh4-0", + "fields": [{"name": "title", "selector": "h4.markdown-title", "type": "text"}], + } + extraction_strategy = JsonCssExtractionStrategy(schema) + + # JavaScript and wait configurations + js_next_page = """document.querySelector('a[data-testid="pagination-next-button"]').click();""" + wait_for = """() => document.querySelectorAll('li.Box-sc-g0xbh4-0').length > 0""" + + # Crawl multiple pages + for page in range(3): + config = CrawlerRunConfig( + url=url, + session_id=session_id, + extraction_strategy=extraction_strategy, + js_code=js_next_page if page > 0 else None, + wait_for=wait_for if page > 0 else None, + js_only=page > 0, + cache_mode=CacheMode.BYPASS + ) + + result = await crawler.arun(config=config) + if result.success: + commits = json.loads(result.extracted_content) + all_commits.extend(commits) + print(f"Page {page + 1}: Found {len(commits)} commits") + + # Clean up session + await crawler.crawler_strategy.kill_session(session_id) + return all_commits +``` + +--- + 
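+The `crawl_dynamic_content` helper above can be driven like any other coroutine. The sketch below is illustrative (the wrapper `main` function is not part of the library); note that the snippet relies on `AsyncWebCrawler` and the standard `json` module, so both must be imported alongside the other imports shown:
+
+```python
+import asyncio
+import json  # used inside crawl_dynamic_content to parse extracted_content
+
+from crawl4ai import AsyncWebCrawler
+
+async def main():
+    commits = await crawl_dynamic_content()
+    print(f"Collected {len(commits)} commits across all pages")
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+---
+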
+#### Session Best Practices + +1. **Descriptive Session IDs**: + Use meaningful names for session IDs to organize workflows: + ```python + session_id = "login_flow_session" + session_id = "product_catalog_session" + ``` + +2. **Resource Management**: + Always ensure sessions are cleaned up to free resources: + ```python + try: + # Your crawling code here + pass + finally: + await crawler.crawler_strategy.kill_session(session_id) + ``` + +3. **State Maintenance**: + Reuse the session for subsequent actions within the same workflow: + ```python + # Step 1: Login + login_config = CrawlerRunConfig( + url="https://example.com/login", + session_id=session_id, + js_code="document.querySelector('form').submit();" + ) + await crawler.arun(config=login_config) + + # Step 2: Verify login success + dashboard_config = CrawlerRunConfig( + url="https://example.com/dashboard", + session_id=session_id, + wait_for="css:.user-profile" # Wait for authenticated content + ) + result = await crawler.arun(config=dashboard_config) + ``` + +--- + +#### Common Use Cases for Sessions + +1. **Authentication Flows**: Login and interact with secured pages. +2. **Pagination Handling**: Navigate through multiple pages. +3. **Form Submissions**: Fill forms, submit, and process results. +4. **Multi-step Processes**: Complete workflows that span multiple actions. +5. **Dynamic Content Navigation**: Handle JavaScript-rendered or event-triggered content. diff --git a/docs/md_v2/api/arun.md b/docs/md_v2/api/arun.md new file mode 100644 index 0000000000000000000000000000000000000000..509991e583ecb8b4c0e85255de93caa319b56865 --- /dev/null +++ b/docs/md_v2/api/arun.md @@ -0,0 +1,244 @@ +# Complete Parameter Guide for arun() + +The following parameters can be passed to the `arun()` method. They are organized by their primary usage context and functionality. 
+ +## Core Parameters + +```python +await crawler.arun( + url="https://example.com", # Required: URL to crawl + verbose=True, # Enable detailed logging + cache_mode=CacheMode.ENABLED, # Control cache behavior + warmup=True # Whether to run warmup check +) +``` + +## Cache Control + +```python +from crawl4ai import CacheMode + +await crawler.arun( + cache_mode=CacheMode.ENABLED, # Normal caching (read/write) + # Other cache modes: + # cache_mode=CacheMode.DISABLED # No caching at all + # cache_mode=CacheMode.READ_ONLY # Only read from cache + # cache_mode=CacheMode.WRITE_ONLY # Only write to cache + # cache_mode=CacheMode.BYPASS # Skip cache for this operation +) +``` + +## Content Processing Parameters + +### Text Processing +```python +await crawler.arun( + word_count_threshold=10, # Minimum words per content block + image_description_min_word_threshold=5, # Minimum words for image descriptions + only_text=False, # Extract only text content + excluded_tags=['form', 'nav'], # HTML tags to exclude + keep_data_attributes=False, # Preserve data-* attributes +) +``` + +### Content Selection +```python +await crawler.arun( + css_selector=".main-content", # CSS selector for content extraction + remove_forms=True, # Remove all form elements + remove_overlay_elements=True, # Remove popups/modals/overlays +) +``` + +### Link Handling +```python +await crawler.arun( + exclude_external_links=True, # Remove external links + exclude_social_media_links=True, # Remove social media links + exclude_external_images=True, # Remove external images + exclude_domains=["ads.example.com"], # Specific domains to exclude + social_media_domains=[ # Additional social media domains + "facebook.com", + "twitter.com", + "instagram.com" + ] +) +``` + +## Browser Control Parameters + +### Basic Browser Settings +```python +await crawler.arun( + headless=True, # Run browser in headless mode + browser_type="chromium", # Browser engine: "chromium", "firefox", "webkit" + page_timeout=60000, # Page load timeout in milliseconds + user_agent="custom-agent", # Custom user agent +) +``` + +### Navigation and Waiting +```python +await crawler.arun( + wait_for="css:.dynamic-content", # Wait for element/condition + delay_before_return_html=2.0, # Wait before returning HTML (seconds) +) +``` + +### JavaScript Execution +```python +await crawler.arun( + js_code=[ # JavaScript to execute (string or list) + "window.scrollTo(0, document.body.scrollHeight);", + "document.querySelector('.load-more').click();" + ], + js_only=False, # Only execute JavaScript without reloading page +) +``` + +### Anti-Bot Features +```python +await crawler.arun( + magic=True, # Enable all anti-detection features + simulate_user=True, # Simulate human behavior + override_navigator=True # Override navigator properties +) +``` + +### Session Management +```python +await crawler.arun( + session_id="my_session", # Session identifier for persistent browsing +) +``` + +### Screenshot Options +```python +await crawler.arun( + screenshot=True, # Take page screenshot + screenshot_wait_for=2.0, # Wait before screenshot (seconds) +) +``` + +### Proxy Configuration +```python +await crawler.arun( + proxy="http://proxy.example.com:8080", # Simple proxy URL + proxy_config={ # Advanced proxy settings + "server": "http://proxy.example.com:8080", + "username": "user", + "password": "pass" + } +) +``` + +## Content Extraction Parameters + +### Extraction Strategy +```python +await crawler.arun( + extraction_strategy=LLMExtractionStrategy( + provider="ollama/llama2", + 
schema=MySchema.schema(), + instruction="Extract specific data" + ) +) +``` + +### Chunking Strategy +```python +await crawler.arun( + chunking_strategy=RegexChunking( + patterns=[r'\n\n', r'\.\s+'] + ) +) +``` + +### HTML to Text Options +```python +await crawler.arun( + html2text={ + "ignore_links": False, + "ignore_images": False, + "escape_dot": False, + "body_width": 0, + "protect_links": True, + "unicode_snob": True + } +) +``` + +## Debug Options +```python +await crawler.arun( + log_console=True, # Log browser console messages +) +``` + +## Parameter Interactions and Notes + +1. **Cache and Performance Setup** + ```python + # Optimal caching for repeated crawls + await crawler.arun( + cache_mode=CacheMode.ENABLED, + word_count_threshold=10, + process_iframes=False + ) + ``` + +2. **Dynamic Content Handling** + ```python + # Handle lazy-loaded content + await crawler.arun( + js_code="window.scrollTo(0, document.body.scrollHeight);", + wait_for="css:.lazy-content", + delay_before_return_html=2.0, + cache_mode=CacheMode.WRITE_ONLY # Cache results after dynamic load + ) + ``` + +3. **Content Extraction Pipeline** + ```python + # Complete extraction setup + await crawler.arun( + css_selector=".main-content", + word_count_threshold=20, + extraction_strategy=my_strategy, + chunking_strategy=my_chunking, + process_iframes=True, + remove_overlay_elements=True, + cache_mode=CacheMode.ENABLED + ) + ``` + +## Best Practices + +1. **Performance Optimization** + ```python + await crawler.arun( + cache_mode=CacheMode.ENABLED, # Use full caching + word_count_threshold=10, # Filter out noise + process_iframes=False # Skip iframes if not needed + ) + ``` + +2. **Reliable Scraping** + ```python + await crawler.arun( + magic=True, # Enable anti-detection + delay_before_return_html=1.0, # Wait for dynamic content + page_timeout=60000, # Longer timeout for slow pages + cache_mode=CacheMode.WRITE_ONLY # Cache results after successful crawl + ) + ``` + +3. **Clean Content** + ```python + await crawler.arun( + remove_overlay_elements=True, # Remove popups + excluded_tags=['nav', 'aside'],# Remove unnecessary elements + keep_data_attributes=False, # Remove data attributes + cache_mode=CacheMode.ENABLED # Use cache for faster processing + ) + ``` \ No newline at end of file diff --git a/docs/md_v2/api/async-webcrawler.md b/docs/md_v2/api/async-webcrawler.md new file mode 100644 index 0000000000000000000000000000000000000000..be95610153899a7eff50e2d133feb1c816dfcb70 --- /dev/null +++ b/docs/md_v2/api/async-webcrawler.md @@ -0,0 +1,320 @@ +# AsyncWebCrawler + +The `AsyncWebCrawler` class is the main interface for web crawling operations. It provides asynchronous web crawling capabilities with extensive configuration options. 
+ +## Constructor + +```python +AsyncWebCrawler( + # Browser Settings + browser_type: str = "chromium", # Options: "chromium", "firefox", "webkit" + headless: bool = True, # Run browser in headless mode + verbose: bool = False, # Enable verbose logging + + # Cache Settings + always_by_pass_cache: bool = False, # Always bypass cache + base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), # Base directory for cache + + # Network Settings + proxy: str = None, # Simple proxy URL + proxy_config: Dict = None, # Advanced proxy configuration + + # Browser Behavior + sleep_on_close: bool = False, # Wait before closing browser + + # Custom Settings + user_agent: str = None, # Custom user agent + headers: Dict[str, str] = {}, # Custom HTTP headers + js_code: Union[str, List[str]] = None, # Default JavaScript to execute +) +``` + +### Parameters in Detail + +#### Browser Settings + +- **browser_type** (str, optional) + - Default: `"chromium"` + - Options: `"chromium"`, `"firefox"`, `"webkit"` + - Controls which browser engine to use + ```python + # Example: Using Firefox + crawler = AsyncWebCrawler(browser_type="firefox") + ``` + +- **headless** (bool, optional) + - Default: `True` + - When `True`, browser runs without GUI + - Set to `False` for debugging + ```python + # Visible browser for debugging + crawler = AsyncWebCrawler(headless=False) + ``` + +- **verbose** (bool, optional) + - Default: `False` + - Enables detailed logging + ```python + # Enable detailed logging + crawler = AsyncWebCrawler(verbose=True) + ``` + +#### Cache Settings + +- **always_by_pass_cache** (bool, optional) + - Default: `False` + - When `True`, always fetches fresh content + ```python + # Always fetch fresh content + crawler = AsyncWebCrawler(always_by_pass_cache=True) + ``` + +- **base_directory** (str, optional) + - Default: User's home directory + - Base path for cache storage + ```python + # Custom cache directory + crawler = AsyncWebCrawler(base_directory="/path/to/cache") + ``` + +#### Network Settings + +- **proxy** (str, optional) + - Simple proxy URL + ```python + # Using simple proxy + crawler = AsyncWebCrawler(proxy="http://proxy.example.com:8080") + ``` + +- **proxy_config** (Dict, optional) + - Advanced proxy configuration with authentication + ```python + # Advanced proxy with auth + crawler = AsyncWebCrawler(proxy_config={ + "server": "http://proxy.example.com:8080", + "username": "user", + "password": "pass" + }) + ``` + +#### Browser Behavior + +- **sleep_on_close** (bool, optional) + - Default: `False` + - Adds delay before closing browser + ```python + # Wait before closing + crawler = AsyncWebCrawler(sleep_on_close=True) + ``` + +#### Custom Settings + +- **user_agent** (str, optional) + - Custom user agent string + ```python + # Custom user agent + crawler = AsyncWebCrawler( + user_agent="Mozilla/5.0 (Custom Agent) Chrome/90.0" + ) + ``` + +- **headers** (Dict[str, str], optional) + - Custom HTTP headers + ```python + # Custom headers + crawler = AsyncWebCrawler( + headers={ + "Accept-Language": "en-US", + "Custom-Header": "Value" + } + ) + ``` + +- **js_code** (Union[str, List[str]], optional) + - Default JavaScript to execute on each page + ```python + # Default JavaScript + crawler = AsyncWebCrawler( + js_code=[ + "window.scrollTo(0, document.body.scrollHeight);", + "document.querySelector('.load-more').click();" + ] + ) + ``` + +## Methods + +### arun() + +The primary method for crawling web pages. 
+ +```python +async def arun( + # Required + url: str, # URL to crawl + + # Content Selection + css_selector: str = None, # CSS selector for content + word_count_threshold: int = 10, # Minimum words per block + + # Cache Control + bypass_cache: bool = False, # Bypass cache for this request + + # Session Management + session_id: str = None, # Session identifier + + # Screenshot Options + screenshot: bool = False, # Take screenshot + screenshot_wait_for: float = None, # Wait before screenshot + + # Content Processing + process_iframes: bool = False, # Process iframe content + remove_overlay_elements: bool = False, # Remove popups/modals + + # Anti-Bot Settings + simulate_user: bool = False, # Simulate human behavior + override_navigator: bool = False, # Override navigator properties + magic: bool = False, # Enable all anti-detection + + # Content Filtering + excluded_tags: List[str] = None, # HTML tags to exclude + exclude_external_links: bool = False, # Remove external links + exclude_social_media_links: bool = False, # Remove social media links + + # JavaScript Handling + js_code: Union[str, List[str]] = None, # JavaScript to execute + wait_for: str = None, # Wait condition + + # Page Loading + page_timeout: int = 60000, # Page load timeout (ms) + delay_before_return_html: float = None, # Wait before return + + # Extraction + extraction_strategy: ExtractionStrategy = None # Extraction strategy +) -> CrawlResult: +``` + +### Usage Examples + +#### Basic Crawling +```python +async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://example.com") +``` + +#### Advanced Crawling +```python +async with AsyncWebCrawler( + browser_type="firefox", + verbose=True, + headers={"Custom-Header": "Value"} +) as crawler: + result = await crawler.arun( + url="https://example.com", + css_selector=".main-content", + word_count_threshold=20, + process_iframes=True, + magic=True, + wait_for="css:.dynamic-content", + screenshot=True + ) +``` + +#### Session Management +```python +async with AsyncWebCrawler() as crawler: + # First request + result1 = await crawler.arun( + url="https://example.com/login", + session_id="my_session" + ) + + # Subsequent request using same session + result2 = await crawler.arun( + url="https://example.com/protected", + session_id="my_session" + ) +``` + +## Context Manager + +AsyncWebCrawler implements the async context manager protocol: + +```python +async def __aenter__(self) -> 'AsyncWebCrawler': + # Initialize browser and resources + return self + +async def __aexit__(self, *args): + # Cleanup resources + pass +``` + +Always use AsyncWebCrawler with async context manager: +```python +async with AsyncWebCrawler() as crawler: + # Your crawling code here + pass +``` + +## Best Practices + +1. **Resource Management** +```python +# Always use context manager +async with AsyncWebCrawler() as crawler: + # Crawler will be properly cleaned up + pass +``` + +2. **Error Handling** +```python +try: + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://example.com") + if not result.success: + print(f"Crawl failed: {result.error_message}") +except Exception as e: + print(f"Error: {str(e)}") +``` + +3. **Performance Optimization** +```python +# Enable caching for better performance +crawler = AsyncWebCrawler( + always_by_pass_cache=False, + verbose=True +) +``` + +4. 
**Anti-Detection** +```python +# Maximum stealth +crawler = AsyncWebCrawler( + headless=True, + user_agent="Mozilla/5.0...", + headers={"Accept-Language": "en-US"} +) +result = await crawler.arun( + url="https://example.com", + magic=True, + simulate_user=True +) +``` + +## Note on Browser Types + +Each browser type has its characteristics: + +- **chromium**: Best overall compatibility +- **firefox**: Good for specific use cases +- **webkit**: Lighter weight, good for basic crawling + +Choose based on your specific needs: +```python +# High compatibility +crawler = AsyncWebCrawler(browser_type="chromium") + +# Memory efficient +crawler = AsyncWebCrawler(browser_type="webkit") +``` \ No newline at end of file diff --git a/docs/md_v2/api/crawl-config.md b/docs/md_v2/api/crawl-config.md new file mode 100644 index 0000000000000000000000000000000000000000..928ae1e2f23b5d6a563fdfe253eb81c3057319c0 --- /dev/null +++ b/docs/md_v2/api/crawl-config.md @@ -0,0 +1,85 @@ +# CrawlerRunConfig Parameters Documentation + +## Content Processing Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `word_count_threshold` | int | 200 | Minimum word count threshold before processing content | +| `extraction_strategy` | ExtractionStrategy | None | Strategy to extract structured data from crawled pages. When None, uses NoExtractionStrategy | +| `chunking_strategy` | ChunkingStrategy | RegexChunking() | Strategy to chunk content before extraction | +| `markdown_generator` | MarkdownGenerationStrategy | None | Strategy for generating markdown from extracted content | +| `content_filter` | RelevantContentFilter | None | Optional filter to prune irrelevant content | +| `only_text` | bool | False | If True, attempt to extract text-only content where applicable | +| `css_selector` | str | None | CSS selector to extract a specific portion of the page | +| `excluded_tags` | list[str] | [] | List of HTML tags to exclude from processing | +| `keep_data_attributes` | bool | False | If True, retain `data-*` attributes while removing unwanted attributes | +| `remove_forms` | bool | False | If True, remove all `` elements from the HTML | +| `prettiify` | bool | False | If True, apply `fast_format_html` to produce prettified HTML output | + +## Caching Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `cache_mode` | CacheMode | None | Defines how caching is handled. 
Defaults to CacheMode.ENABLED internally | +| `session_id` | str | None | Optional session ID to persist browser context and page instance | +| `bypass_cache` | bool | False | Legacy parameter, if True acts like CacheMode.BYPASS | +| `disable_cache` | bool | False | Legacy parameter, if True acts like CacheMode.DISABLED | +| `no_cache_read` | bool | False | Legacy parameter, if True acts like CacheMode.WRITE_ONLY | +| `no_cache_write` | bool | False | Legacy parameter, if True acts like CacheMode.READ_ONLY | + +## Page Navigation and Timing Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `wait_until` | str | "domcontentloaded" | The condition to wait for when navigating | +| `page_timeout` | int | 60000 | Timeout in milliseconds for page operations like navigation | +| `wait_for` | str | None | CSS selector or JS condition to wait for before extracting content | +| `wait_for_images` | bool | True | If True, wait for images to load before extracting content | +| `delay_before_return_html` | float | 0.1 | Delay in seconds before retrieving final HTML | +| `mean_delay` | float | 0.1 | Mean base delay between requests when calling arun_many | +| `max_range` | float | 0.3 | Max random additional delay range for requests in arun_many | +| `semaphore_count` | int | 5 | Number of concurrent operations allowed | + +## Page Interaction Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `js_code` | str or list[str] | None | JavaScript code/snippets to run on the page | +| `js_only` | bool | False | If True, indicates subsequent calls are JS-driven updates | +| `ignore_body_visibility` | bool | True | If True, ignore whether the body is visible before proceeding | +| `scan_full_page` | bool | False | If True, scroll through the entire page to load all content | +| `scroll_delay` | float | 0.2 | Delay in seconds between scroll steps if scan_full_page is True | +| `process_iframes` | bool | False | If True, attempts to process and inline iframe content | +| `remove_overlay_elements` | bool | False | If True, remove overlays/popups before extracting HTML | +| `simulate_user` | bool | False | If True, simulate user interactions for anti-bot measures | +| `override_navigator` | bool | False | If True, overrides navigator properties for more human-like behavior | +| `magic` | bool | False | If True, attempts automatic handling of overlays/popups | +| `adjust_viewport_to_content` | bool | False | If True, adjust viewport according to page content dimensions | + +## Media Handling Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `screenshot` | bool | False | Whether to take a screenshot after crawling | +| `screenshot_wait_for` | float | None | Additional wait time before taking a screenshot | +| `screenshot_height_threshold` | int | 20000 | Threshold for page height to decide screenshot strategy | +| `pdf` | bool | False | Whether to generate a PDF of the page | +| `image_description_min_word_threshold` | int | 50 | Minimum words for image description extraction | +| `image_score_threshold` | int | 3 | Minimum score threshold for processing an image | +| `exclude_external_images` | bool | False | If True, exclude all external images from processing | + +## Link and Domain Handling Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `exclude_social_media_domains` | list[str] | 
SOCIAL_MEDIA_DOMAINS | List of domains to exclude for social media links | +| `exclude_external_links` | bool | False | If True, exclude all external links from the results | +| `exclude_social_media_links` | bool | False | If True, exclude links pointing to social media domains | +| `exclude_domains` | list[str] | [] | List of specific domains to exclude from results | + +## Debugging and Logging Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `verbose` | bool | True | Enable verbose logging | +| `log_console` | bool | False | If True, log console messages from the page | \ No newline at end of file diff --git a/docs/md_v2/api/crawl-result.md b/docs/md_v2/api/crawl-result.md new file mode 100644 index 0000000000000000000000000000000000000000..7e3bda98476d360095dcc5aa5812afa379cef9fe --- /dev/null +++ b/docs/md_v2/api/crawl-result.md @@ -0,0 +1,302 @@ +# CrawlResult + +The `CrawlResult` class represents the result of a web crawling operation. It provides access to various forms of extracted content and metadata from the crawled webpage. + +## Class Definition + +```python +class CrawlResult(BaseModel): + """Result of a web crawling operation.""" + + # Basic Information + url: str # Crawled URL + success: bool # Whether crawl succeeded + status_code: Optional[int] = None # HTTP status code + error_message: Optional[str] = None # Error message if failed + + # Content + html: str # Raw HTML content + cleaned_html: Optional[str] = None # Cleaned HTML + fit_html: Optional[str] = None # Most relevant HTML content + markdown: Optional[str] = None # HTML converted to markdown + fit_markdown: Optional[str] = None # Most relevant markdown content + downloaded_files: Optional[List[str]] = None # Downloaded files + + # Extracted Data + extracted_content: Optional[str] = None # Content from extraction strategy + media: Dict[str, List[Dict]] = {} # Extracted media information + links: Dict[str, List[Dict]] = {} # Extracted links + metadata: Optional[dict] = None # Page metadata + + # Additional Data + screenshot: Optional[str] = None # Base64 encoded screenshot + session_id: Optional[str] = None # Session identifier + response_headers: Optional[dict] = None # HTTP response headers +``` + +## Properties and Their Data Structures + +### Basic Information + +```python +# Access basic information +result = await crawler.arun(url="https://example.com") + +print(result.url) # "https://example.com" +print(result.success) # True/False +print(result.status_code) # 200, 404, etc. +print(result.error_message) # Error details if failed +``` + +### Content Properties + +#### HTML Content +```python +# Raw HTML +html_content = result.html + +# Cleaned HTML (removed ads, popups, etc.) 
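# (cleaned_html, fit_html, and the markdown fields are Optional and may be None
#  when a crawl fails or a processing step is skipped, so guard before using them)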
+clean_content = result.cleaned_html + +# Most relevant HTML content +main_content = result.fit_html +``` + +#### Markdown Content +```python +# Full markdown version +markdown_content = result.markdown + +# Most relevant markdown content +main_content = result.fit_markdown +``` + +### Media Content + +The media dictionary contains organized media elements: + +```python +# Structure +media = { + "images": [ + { + "src": str, # Image URL + "alt": str, # Alt text + "desc": str, # Contextual description + "score": float, # Relevance score (0-10) + "type": str, # "image" + "width": int, # Image width (if available) + "height": int, # Image height (if available) + "context": str, # Surrounding text + "lazy": bool # Whether image was lazy-loaded + } + ], + "videos": [ + { + "src": str, # Video URL + "type": str, # "video" + "title": str, # Video title + "poster": str, # Thumbnail URL + "duration": str, # Video duration + "description": str # Video description + } + ], + "audios": [ + { + "src": str, # Audio URL + "type": str, # "audio" + "title": str, # Audio title + "duration": str, # Audio duration + "description": str # Audio description + } + ] +} + +# Example usage +for image in result.media["images"]: + if image["score"] > 5: # High-relevance images + print(f"High-quality image: {image['src']}") + print(f"Context: {image['context']}") +``` + +### Link Analysis + +The links dictionary organizes discovered links: + +```python +# Structure +links = { + "internal": [ + { + "href": str, # URL + "text": str, # Link text + "title": str, # Title attribute + "type": str, # Link type (nav, content, etc.) + "context": str, # Surrounding text + "score": float # Relevance score + } + ], + "external": [ + { + "href": str, # External URL + "text": str, # Link text + "title": str, # Title attribute + "domain": str, # Domain name + "type": str, # Link type + "context": str # Surrounding text + } + ] +} + +# Example usage +for link in result.links["internal"]: + print(f"Internal link: {link['href']}") + print(f"Context: {link['context']}") +``` + +### Metadata + +The metadata dictionary contains page information: + +```python +# Structure +metadata = { + "title": str, # Page title + "description": str, # Meta description + "keywords": List[str], # Meta keywords + "author": str, # Author information + "published_date": str, # Publication date + "modified_date": str, # Last modified date + "language": str, # Page language + "canonical_url": str, # Canonical URL + "og_data": Dict, # Open Graph data + "twitter_data": Dict # Twitter card data +} + +# Example usage +if result.metadata: + print(f"Title: {result.metadata['title']}") + print(f"Author: {result.metadata.get('author', 'Unknown')}") +``` + +### Extracted Content + +Content from extraction strategies: + +```python +# For LLM or CSS extraction strategies +if result.extracted_content: + structured_data = json.loads(result.extracted_content) + print(structured_data) +``` + +### Screenshot + +Base64 encoded screenshot: + +```python +# Save screenshot if available +if result.screenshot: + import base64 + + # Decode and save + with open("screenshot.png", "wb") as f: + f.write(base64.b64decode(result.screenshot)) +``` + +## Usage Examples + +### Basic Content Access +```python +async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://example.com") + + if result.success: + # Get clean content + print(result.fit_markdown) + + # Process images + for image in result.media["images"]: + if image["score"] > 7: + print(f"High-quality image: 
{image['src']}") +``` + +### Complete Data Processing +```python +async def process_webpage(url: str) -> Dict: + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url=url) + + if not result.success: + raise Exception(f"Crawl failed: {result.error_message}") + + return { + "content": result.fit_markdown, + "images": [ + img for img in result.media["images"] + if img["score"] > 5 + ], + "internal_links": [ + link["href"] for link in result.links["internal"] + ], + "metadata": result.metadata, + "status": result.status_code + } +``` + +### Error Handling +```python +async def safe_crawl(url: str) -> Dict: + async with AsyncWebCrawler() as crawler: + try: + result = await crawler.arun(url=url) + + if not result.success: + return { + "success": False, + "error": result.error_message, + "status": result.status_code + } + + return { + "success": True, + "content": result.fit_markdown, + "status": result.status_code + } + + except Exception as e: + return { + "success": False, + "error": str(e), + "status": None + } +``` + +## Best Practices + +1. **Always Check Success** +```python +if not result.success: + print(f"Error: {result.error_message}") + return +``` + +2. **Use fit_markdown for Articles** +```python +# Better for article content +content = result.fit_markdown if result.fit_markdown else result.markdown +``` + +3. **Filter Media by Score** +```python +relevant_images = [ + img for img in result.media["images"] + if img["score"] > 5 +] +``` + +4. **Handle Missing Data** +```python +metadata = result.metadata or {} +title = metadata.get('title', 'Unknown Title') +``` \ No newline at end of file diff --git a/docs/md_v2/api/parameters.md b/docs/md_v2/api/parameters.md new file mode 100644 index 0000000000000000000000000000000000000000..c1c4d2ea1be06321ee87f6ffb6ffde3db3ad086b --- /dev/null +++ b/docs/md_v2/api/parameters.md @@ -0,0 +1,36 @@ +# Parameter Reference Table + +| File Name | Parameter Name | Code Usage | Strategy/Class | Description | +|-----------|---------------|------------|----------------|-------------| +| async_crawler_strategy.py | user_agent | `kwargs.get("user_agent")` | AsyncPlaywrightCrawlerStrategy | User agent string for browser identification | +| async_crawler_strategy.py | proxy | `kwargs.get("proxy")` | AsyncPlaywrightCrawlerStrategy | Proxy server configuration for network requests | +| async_crawler_strategy.py | proxy_config | `kwargs.get("proxy_config")` | AsyncPlaywrightCrawlerStrategy | Detailed proxy configuration including auth | +| async_crawler_strategy.py | headless | `kwargs.get("headless", True)` | AsyncPlaywrightCrawlerStrategy | Whether to run browser in headless mode | +| async_crawler_strategy.py | browser_type | `kwargs.get("browser_type", "chromium")` | AsyncPlaywrightCrawlerStrategy | Type of browser to use (chromium/firefox/webkit) | +| async_crawler_strategy.py | headers | `kwargs.get("headers", {})` | AsyncPlaywrightCrawlerStrategy | Custom HTTP headers for requests | +| async_crawler_strategy.py | verbose | `kwargs.get("verbose", False)` | AsyncPlaywrightCrawlerStrategy | Enable detailed logging output | +| async_crawler_strategy.py | sleep_on_close | `kwargs.get("sleep_on_close", False)` | AsyncPlaywrightCrawlerStrategy | Add delay before closing browser | +| async_crawler_strategy.py | use_managed_browser | `kwargs.get("use_managed_browser", False)` | AsyncPlaywrightCrawlerStrategy | Use managed browser instance | +| async_crawler_strategy.py | user_data_dir | `kwargs.get("user_data_dir", None)` | 
AsyncPlaywrightCrawlerStrategy | Custom directory for browser profile data | +| async_crawler_strategy.py | session_id | `kwargs.get("session_id")` | AsyncPlaywrightCrawlerStrategy | Unique identifier for browser session | +| async_crawler_strategy.py | override_navigator | `kwargs.get("override_navigator", False)` | AsyncPlaywrightCrawlerStrategy | Override browser navigator properties | +| async_crawler_strategy.py | simulate_user | `kwargs.get("simulate_user", False)` | AsyncPlaywrightCrawlerStrategy | Simulate human-like behavior | +| async_crawler_strategy.py | magic | `kwargs.get("magic", False)` | AsyncPlaywrightCrawlerStrategy | Enable advanced anti-detection features | +| async_crawler_strategy.py | log_console | `kwargs.get("log_console", False)` | AsyncPlaywrightCrawlerStrategy | Log browser console messages | +| async_crawler_strategy.py | js_only | `kwargs.get("js_only", False)` | AsyncPlaywrightCrawlerStrategy | Only execute JavaScript without page load | +| async_crawler_strategy.py | page_timeout | `kwargs.get("page_timeout", 60000)` | AsyncPlaywrightCrawlerStrategy | Timeout for page load in milliseconds | +| async_crawler_strategy.py | ignore_body_visibility | `kwargs.get("ignore_body_visibility", True)` | AsyncPlaywrightCrawlerStrategy | Process page even if body is hidden | +| async_crawler_strategy.py | js_code | `kwargs.get("js_code", kwargs.get("js", self.js_code))` | AsyncPlaywrightCrawlerStrategy | Custom JavaScript code to execute | +| async_crawler_strategy.py | wait_for | `kwargs.get("wait_for")` | AsyncPlaywrightCrawlerStrategy | Wait for specific element/condition | +| async_crawler_strategy.py | process_iframes | `kwargs.get("process_iframes", False)` | AsyncPlaywrightCrawlerStrategy | Extract content from iframes | +| async_crawler_strategy.py | delay_before_return_html | `kwargs.get("delay_before_return_html")` | AsyncPlaywrightCrawlerStrategy | Additional delay before returning HTML | +| async_crawler_strategy.py | remove_overlay_elements | `kwargs.get("remove_overlay_elements", False)` | AsyncPlaywrightCrawlerStrategy | Remove pop-ups and overlay elements | +| async_crawler_strategy.py | screenshot | `kwargs.get("screenshot")` | AsyncPlaywrightCrawlerStrategy | Take page screenshot | +| async_crawler_strategy.py | screenshot_wait_for | `kwargs.get("screenshot_wait_for")` | AsyncPlaywrightCrawlerStrategy | Wait before taking screenshot | +| async_crawler_strategy.py | semaphore_count | `kwargs.get("semaphore_count", 5)` | AsyncPlaywrightCrawlerStrategy | Concurrent request limit | +| async_webcrawler.py | verbose | `kwargs.get("verbose", False)` | AsyncWebCrawler | Enable detailed logging | +| async_webcrawler.py | warmup | `kwargs.get("warmup", True)` | AsyncWebCrawler | Initialize crawler with warmup request | +| async_webcrawler.py | session_id | `kwargs.get("session_id", None)` | AsyncWebCrawler | Session identifier for browser reuse | +| async_webcrawler.py | only_text | `kwargs.get("only_text", False)` | AsyncWebCrawler | Extract only text content | +| async_webcrawler.py | bypass_cache | `kwargs.get("bypass_cache", False)` | AsyncWebCrawler | Skip cache and force fresh crawl | +| async_webcrawler.py | cache_mode | `kwargs.get("cache_mode", CacheMode.ENABLE)` | AsyncWebCrawler | Cache handling mode for request | \ No newline at end of file diff --git a/docs/md_v2/api/strategies.md b/docs/md_v2/api/strategies.md new file mode 100644 index 0000000000000000000000000000000000000000..f0f8f57cb18cba1d48d076e6b46c1048d04a5d89 --- /dev/null +++ 
b/docs/md_v2/api/strategies.md @@ -0,0 +1,255 @@ +# Extraction & Chunking Strategies API + +This documentation covers the API reference for extraction and chunking strategies in Crawl4AI. + +## Extraction Strategies + +All extraction strategies inherit from the base `ExtractionStrategy` class and implement two key methods: +- `extract(url: str, html: str) -> List[Dict[str, Any]]` +- `run(url: str, sections: List[str]) -> List[Dict[str, Any]]` + +### LLMExtractionStrategy + +Used for extracting structured data using Language Models. + +```python +LLMExtractionStrategy( + # Required Parameters + provider: str = DEFAULT_PROVIDER, # LLM provider (e.g., "ollama/llama2") + api_token: Optional[str] = None, # API token + + # Extraction Configuration + instruction: str = None, # Custom extraction instruction + schema: Dict = None, # Pydantic model schema for structured data + extraction_type: str = "block", # "block" or "schema" + + # Chunking Parameters + chunk_token_threshold: int = 4000, # Maximum tokens per chunk + overlap_rate: float = 0.1, # Overlap between chunks + word_token_rate: float = 0.75, # Word to token conversion rate + apply_chunking: bool = True, # Enable/disable chunking + + # API Configuration + base_url: str = None, # Base URL for API + extra_args: Dict = {}, # Additional provider arguments + verbose: bool = False # Enable verbose logging +) +``` + +### CosineStrategy + +Used for content similarity-based extraction and clustering. + +```python +CosineStrategy( + # Content Filtering + semantic_filter: str = None, # Topic/keyword filter + word_count_threshold: int = 10, # Minimum words per cluster + sim_threshold: float = 0.3, # Similarity threshold + + # Clustering Parameters + max_dist: float = 0.2, # Maximum cluster distance + linkage_method: str = 'ward', # Clustering method + top_k: int = 3, # Top clusters to return + + # Model Configuration + model_name: str = 'sentence-transformers/all-MiniLM-L6-v2', # Embedding model + + verbose: bool = False # Enable verbose logging +) +``` + +### JsonCssExtractionStrategy + +Used for CSS selector-based structured data extraction. + +```python +JsonCssExtractionStrategy( + schema: Dict[str, Any], # Extraction schema + verbose: bool = False # Enable verbose logging +) + +# Schema Structure +schema = { + "name": str, # Schema name + "baseSelector": str, # Base CSS selector + "fields": [ # List of fields to extract + { + "name": str, # Field name + "selector": str, # CSS selector + "type": str, # Field type: "text", "attribute", "html", "regex" + "attribute": str, # For type="attribute" + "pattern": str, # For type="regex" + "transform": str, # Optional: "lowercase", "uppercase", "strip" + "default": Any # Default value if extraction fails + } + ] +} +``` + +## Chunking Strategies + +All chunking strategies inherit from `ChunkingStrategy` and implement the `chunk(text: str) -> list` method. + +### RegexChunking + +Splits text based on regex patterns. + +```python +RegexChunking( + patterns: List[str] = None # Regex patterns for splitting + # Default: [r'\n\n'] +) +``` + +### SlidingWindowChunking + +Creates overlapping chunks with a sliding window approach. + +```python +SlidingWindowChunking( + window_size: int = 100, # Window size in words + step: int = 50 # Step size between windows +) +``` + +### OverlappingWindowChunking + +Creates chunks with specified overlap. 
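A short standalone sketch of how it can be driven through the shared `chunk()` method described above (the text and word counts are illustrative placeholders; the full signature follows):

```python
from crawl4ai.chunking_strategy import OverlappingWindowChunking

# Placeholder text standing in for a long document
text = ("lorem ipsum dolor sit amet " * 200).strip()

# 200-word chunks where each new chunk repeats the last 20 words of the
# previous one, preserving some context across chunk boundaries
chunker = OverlappingWindowChunking(window_size=200, overlap=20)
chunks = chunker.chunk(text)
print(f"{len(chunks)} chunks produced")
```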
+ +```python +OverlappingWindowChunking( + window_size: int = 1000, # Chunk size in words + overlap: int = 100 # Overlap size in words +) +``` + +## Usage Examples + +### LLM Extraction + +```python +from pydantic import BaseModel +from crawl4ai.extraction_strategy import LLMExtractionStrategy + +# Define schema +class Article(BaseModel): + title: str + content: str + author: str + +# Create strategy +strategy = LLMExtractionStrategy( + provider="ollama/llama2", + schema=Article.schema(), + instruction="Extract article details" +) + +# Use with crawler +result = await crawler.arun( + url="https://example.com/article", + extraction_strategy=strategy +) + +# Access extracted data +data = json.loads(result.extracted_content) +``` + +### CSS Extraction + +```python +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy + +# Define schema +schema = { + "name": "Product List", + "baseSelector": ".product-card", + "fields": [ + { + "name": "title", + "selector": "h2.title", + "type": "text" + }, + { + "name": "price", + "selector": ".price", + "type": "text", + "transform": "strip" + }, + { + "name": "image", + "selector": "img", + "type": "attribute", + "attribute": "src" + } + ] +} + +# Create and use strategy +strategy = JsonCssExtractionStrategy(schema) +result = await crawler.arun( + url="https://example.com/products", + extraction_strategy=strategy +) +``` + +### Content Chunking + +```python +from crawl4ai.chunking_strategy import OverlappingWindowChunking + +# Create chunking strategy +chunker = OverlappingWindowChunking( + window_size=500, # 500 words per chunk + overlap=50 # 50 words overlap +) + +# Use with extraction strategy +strategy = LLMExtractionStrategy( + provider="ollama/llama2", + chunking_strategy=chunker +) + +result = await crawler.arun( + url="https://example.com/long-article", + extraction_strategy=strategy +) +``` + +## Best Practices + +1. **Choose the Right Strategy** + - Use `LLMExtractionStrategy` for complex, unstructured content + - Use `JsonCssExtractionStrategy` for well-structured HTML + - Use `CosineStrategy` for content similarity and clustering + +2. **Optimize Chunking** + ```python + # For long documents + strategy = LLMExtractionStrategy( + chunk_token_threshold=2000, # Smaller chunks + overlap_rate=0.1 # 10% overlap + ) + ``` + +3. **Handle Errors** + ```python + try: + result = await crawler.arun( + url="https://example.com", + extraction_strategy=strategy + ) + if result.success: + content = json.loads(result.extracted_content) + except Exception as e: + print(f"Extraction failed: {e}") + ``` + +4. 
**Monitor Performance** + ```python + strategy = CosineStrategy( + verbose=True, # Enable logging + word_count_threshold=20, # Filter short content + top_k=5 # Limit results + ) + ``` \ No newline at end of file diff --git a/docs/md_v2/assets/DankMono-Bold.woff2 b/docs/md_v2/assets/DankMono-Bold.woff2 new file mode 100644 index 0000000000000000000000000000000000000000..3072fd8567c7f38769e8fa161b92417f2630f902 Binary files /dev/null and b/docs/md_v2/assets/DankMono-Bold.woff2 differ diff --git a/docs/md_v2/assets/DankMono-Italic.woff2 b/docs/md_v2/assets/DankMono-Italic.woff2 new file mode 100644 index 0000000000000000000000000000000000000000..1d01ea6d73be14be9ac9014865475ee73c597fb5 Binary files /dev/null and b/docs/md_v2/assets/DankMono-Italic.woff2 differ diff --git a/docs/md_v2/assets/DankMono-Regular.woff2 b/docs/md_v2/assets/DankMono-Regular.woff2 new file mode 100644 index 0000000000000000000000000000000000000000..99c1425ce4f3f324c29c0d4ac465bf73ffa363e9 Binary files /dev/null and b/docs/md_v2/assets/DankMono-Regular.woff2 differ diff --git a/docs/md_v2/assets/Monaco.woff b/docs/md_v2/assets/Monaco.woff new file mode 100644 index 0000000000000000000000000000000000000000..e468c424971227557730c31799863d7cdf9edf7e Binary files /dev/null and b/docs/md_v2/assets/Monaco.woff differ diff --git a/docs/md_v2/assets/dmvendor.css b/docs/md_v2/assets/dmvendor.css new file mode 100644 index 0000000000000000000000000000000000000000..0f72703d2618ba9d3b64bdc7db601344b4a17cec --- /dev/null +++ b/docs/md_v2/assets/dmvendor.css @@ -0,0 +1,127 @@ +/*! + * @preserve + * Dank Mono (v1.000) + * This font is subject to its EULA. https://dank.sh + * © 2018–2020 Phil Plückthun. All Rights Reserved. + */ + +@font-face{ + font-family: "dm"; + font-weight: normal; + font-style: normal; + unicode-range: U+0000-007F; + src: local('☺'), + 
url(data:font/woff2;charset=utf-8;base64,d09GMk9UVE8AAB+IAAwAAAAALhwAAB86AAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAADc1uGigbIByEAAZgAEQBNgIkA4N8BAYFgXQHIBtJLVGUcl4UgJ8HmZuXZtEW1XZl1rY1cWX7wvl7NY0jDnGEqlmVEZLM+g+Xe+//m9NaFMHk+OwEWgs4kDD0hNcYoLl1G7sFvSC2G3ejJxZRSrYORERq0KMGrIBXQjBIi6oBI5QqC7ByVsTBz+Ahf/fuPoLQbUIXbIhSU7UmVOsCIQpFLKyeN+d/7VvOOXrpvv9031nbu/K9CoRoiySCCEkIssgKDiJEJ5yEg8A4JBb8jSMYPogQokhBgbQIkOAKcVF6IMtfyIQ+d+cu/v+v/au9+3MAUeHyCCrGRZiZMzN577y77uPAywswTSaA3OomsjiqIQJVRNUqEqaqvqveEAorKjzrNFiBKWCmykph97vHcK4frYg5UKbvwyBcApjAkAdx/PSgLH7W8ncvIoOz9HysxsIcAAuLLDjpgq6hIAdK8AkoHaz+BSs1z6FKKHu36QGAe4OBjc9cfBmfqx7I7kVZJDIME8Ag0mDAhr8IGYrt1mvaRTzCZ+IxdFrRlesp5FbWsodzvMjH/Cx4LCTW4iwBUiEtIpvwUzMKc9/LM04oSiKaVlxhxnbjrchITYmLjxNw41PaiKIdvIL0JNHOIlwTRvYoRZjbNW6WNTvc9ksSJFdlg03MxCA8yR1sg+NgD8eEvwqhRVwmmh1otz0PL/9nINpnC+xkSsRGcbADHLAs7+xueTAcrEx4M8nVDKINaFcahkTCPu7ZgBjrzc9djROALdc83jmklZkzBbirzTAwKZu4Y2KGvfNW8JLydJbg3cpihXmagPp7RbETopa0E9PXxaN633U5GQnpv7FBrchc9XG/8K/zIX7xseZklgd4AgDA2EEA0wA8gDtiM6DXYcobQBEnBLF6xQiIHP+i+EeI+AUrlma5pRQNVL626ZBe1FcGx9hiXDXNzRzznGVpca37tq9dZB+3XzgE4hJnu0skebut7ivPyPPz+N5R76cK0+f4RX6L/zLQDCKCuuC7WmR4LnwbMSLHqCO6Ef3WsIqD47S4LO6JD8bXEkfNZs2HZC+yjPySEk2RUe5RmVR/ajr1FI1M20C7Dh6Qz8oseVA2anf1Zfl2+Kqe6fPMtGcadcWzK561n2vz3IrnLj4XZrzu/8Ufhr4L7AnUxTwf83rM1RjSa/H86OeXGH8xuhqTjZ+N/5rPmJPMX836uL/H++JTtZhwpqZpEAYrF4550P9ISSlBKJfiH0NYPIWJpfSPvUcectIq8eREZ+fkhLgzIUEkToA1RzqnJ5cCz8fvHlNnw1PFy9PzQX3njy0DqOnPd6Msoc+OW8wQTXfmY9SCqoGuh1Bd0h+t0Xq2hXB5levz31SEs3LOiY3LMsAzNuzPvVBXvgXVrKqkF9fTNqKhrHs3+JYChwjWqje4d+pfymdRNUvmsPHZiwCBfWdoGLa8t6vkttG3p9d8ThxmFQnQllReh7EfOv3PELLeWPBtVQWEIXHsq2HJzMYTJ56ddmrgTocfow4nZwkifdyzG2H8BJS5QHmGQSfEsMVRsowDROT00cKoqJ5zMN7J3zUj026eQRtUtyzce+DmUphqRI9N0w+c9mvzzsDvOJqvGhSzqwSmoaXTlw1eNeGMSlccaSyYoiROEyPVSIOsKCy+c2fk2aSktFF2SQXDKsxFZ+T4nJ5LF1Z2zx/LE/j7S3I2IHRKxWP/NGi5Nxx2mWev7cVorlbJxV3Xzpjr3ujN2Jd6c4suKUE7b2fHVPsh+riCj7LyuLnJdOIgi3W//+G+Q4wLO4bkp/NzuD3w6ROscPLwAecAEXr3j311ScL5TmbrEa+4O/hJGwRSBs2AKezl6Fcj0jdC3btXVJ3ZNn/1EHI47WLv6mZRxOOVWa5syKrOQzYhHjsGaxu8mMBBCbdjKB4w2Eb6cZAcyHwAGVCw/+h4ldigQsaxlj4yWDpU4O63n6ZI6RSpAnJPV6gQTCMemwDCAeZXcU+TChA8ADUgB3B6Sx/peYzw5KavyOE5XbkUzFh6NlaotbIo6/fKJ+zCXQagwercG+cJmsH0PNgIPbvfzgKFBdFOThmJsnozDZ2WB5OWF3R0SqwkcwvmsLTIEubJZE4ruUbW6PKgA2QvMogp898Fd699Ae0TfsY6/bzdm2u3wDmjU7xKC4kaGi2P6sIb6ULJCuJaLsiXhXXk8+SBk1442VOimryq23ko3cinLgVTVFTYla35Y7cu9pyvw7c673kFSSoCjlxgAUuAcEUBComAQ2Se/9cHJGcqMtCFt3L/bkFMP0I1Lp/NOV586YYf6dfBfAAnuH8kdiseWz62zsfRTyH9FToJk2d0DGfBWtCRenDU9kJFyeWdn+Sb9k0S3TvklrLgLb5cWdndyfhfcnYiksSMQZ5xdh9FaUHLoVgelWl7VaZCJZHJjMxInwyMiRMlklxJXi69vaqxXETAC74Iwlo4+KH8wkXSjuv9auIrxrnJ7Eb+yZUXA1X5yV6IlluKkyGMlXqUqMaLYMdE5q8+t/QwVHsqveMLw8crQPPVlBk8lK+MTEickKkAenHXXnnJTJsMdjpUkg0wgOX3LDDuv1w6YDNyeefIecsP3RZ3SKt3hzfSsmEtqFJIWAYTXIrGD1ufbaNMooPr31OZLfTZKAO+jPCWWBXs3tMBJ5IJeXHsUlzWntMudz3TavGABpQ8eDUau7+jpsFL6codq+Y6zwUenR5G6qQ2qq+Bqe8cES47nmmOGQumhK5PvxkAqxH9O6yxss2fivhBWaLEfGQVdjLIQy6Tv+YymZZbjsVrEJZ5qm18rfBWK5EkgSDHP+JhCYuH39ERy1r/Tit6pr4Tx4OPCRjcrofKnkPFm/rhY8Vs7aTAZg5EzqRC2rTGRPtorT9C/VrE9+zRomX/WwDNa+LEtlu7rwe0oCONhlPdM6WOVKYoW0E2FEr2tIkm/HVcP4FpmCEMzsgwLKGp5RZUwPEElTx43wtd9b9MPNrDmmGQxswM4wF+MgEwAHTo198CfDWjUfGvbYa1ffZerR+kbhcIJDBpGKkYe1wrefrI/Sd9BoC3jjiM8TcbNmNlu6E7PxvoUStkysT17plw700Tj13xXzs6fPny4TQ3WMstA++aUir3lFTgq+2FHL2y8vL2yWEkQ5BBtzakOuXluhY+nD4UfGfv+blnAnCFCrSwxoRAwLHVI0UHskhIIDuyrG8DY2OFCpT/X9KDQLO/GDfLNcI50C7l2iP0LzdvNZjabyCjhkVrcHMlSrMJ24YUcs/KFGYrblrZMXWW879Am9QFD0jK1gsSFDJRKAqV7LhLzxGH54iddZ9pWGVBx/KoPhUlXX3+ZTJT+nwx6QBMh0D+OgBL9VbOSsZziqXnFLfDSVFQwmqF5iSUebCrBnq2VwnUJ9lnUlpwHNx+6NRIBd5fw3/fN3btlHFq37F93xiA2p0jPQJbT07hw2ez2Ontd9
eeDzQembHKpG3uXDyIgMV9secFW2SXVSDW9hZhhyS7PIqBXRfj8glGzOuvwxIGfqV2CmrQA/5sF1pljBR7ZmW2REMEpHQkfTItjX8ysbV5aD8y01ZpAnGdoEbOlJSqCHJxBsq/0Ppz8uTNPZ0FiV0w5YlZ7woKeVDvV4ueIJT9W9U9GVKFQYo+H4Y+Pfs8shl2cX1+EWUzm1O2I2WkwikuImVodCC5hOvZfyocRqlcC9qYs5Fre2psyMu9uOsDu/4+jBqyQu23mKfWdR287gfUvYDsDfKISDnEiipBOJfZplXaWAOTzDDH7SbIrRHAH33Dvucr0xUqppZrOYjlOV7nVeXwkSPnrx6XHCDiYOAQhfiroEPp0sWMIe3I/fXgf98ipdLYANkSgdypigobFS/CDVSQ79SvQLSgrb5Ve2wbTyq8cLal+obWAppnX6R4ih7DijNRjgsSfR2BoVyDgiJS1HDY3YYO2ly4UBixsd32AGt7gfUpH5u+YQembZrGdnyzXR+uBZV+UkrsKhqqtr4Tqj6b67NxETFxbkyh5VpIDylsxB7Qd/gmqSkFDeCk26srYUhzGADEIOcGVEEv3JPjc8K4dJLZA/sO2wfIn1TJjtpoewPpwxu2j9Eo+JhaUK1ICQUX8NG3bDzIoy655XerdLwhhGhLb2oLnk9E7+rEJLl4eqDwSUJzIkIeTNhe4H3KmxdNp00DEnIJKTCN0MhNaCrP9uJRroTx/avAmOpgzn4SrksvvHCHeWo6z7TTwPWzsXD6cHDsvs/THuWQfxiClZga21vjs9HBt+xBx+gMA5XTT/Uv3FHGjEBOx10cU7gkoQVDwcWI6CdP9Gw1H5L0z6mSuJ4XkapQoSk6jN9Qm6PhcgGMKn02DH0ORug79ytIR4hwjPBRd/kuqwD0Q6BHRTC2I20nMojZkTZOdplYmoiNE1BmVKeIlGtBBxOEVXSVjbO1+s4qn+1gOM5GMKyW67MdaZL7RtBZ9YYlci233DEp8RxvwEJD/G+MZyPaFBXojocZeZB7M2JTN1U9UDSFl7moXJLqyHqkYlvxE5SDUBRLOyhltBkFh3GRJCKeOq64vHWOVEakNi7WCARCrRCrYNr6PrFNMnOpEnMwE49Oj23VJi1TQTADIk7tn3YN6PXe40pNkuQOsbDkPYPo/BsgwF5tgwWRP3DrGw0LtMKzb4HZZu3fNxDWntaLDmte/YAAvxbcBs/K6H0n/WCdL2zHwpcvr6pqRX6YW1tQU9AXo3v6hTPtJvsb+N7Rb/OhGV/+vK12e93m7oLuwi7ig76CCyD5Y7F1BhVx/QM7JqcBnMp0sSo0Gvn1Xz7Q03AjV8LsyU1Yb2o8SKaeTXrrhh8r85cWHP0F62kXd+cFzyOnTzA9r2hi593tAqTfqiOpE/nPVVjyjvkw/8SkJH/rgekKXPKodfMpcFo6qKcNGX899Qnx+SLkih6jtp5XWg4gkRiJsGMUpHM9O+TuZr8L73CDmHIbD2xvOZIkMe62hz3s79SAYajLSe550Tw1a3nbAflfYHU+SWh5OZL0s+1Rd70Y9NkctRQJMgMmRCAKc6CS2L7S30HYRx+0leEDSveV832PZ8Fc9bgj7ggx98RFlXN99gmE7F+gIsGfxMn+A3o6mYyMkXB87+bcG1GdeW7vt3sMp9VR02D0EZ7C3p/E9Z2UsPc39TCFGFowKFR6L59lSfBcJlWgy+o47HE8RMdXD+IpXUgFTJbIyZHOtMUZxodBDDw1dccdSXds3BFoV1FxKDeTFikbiyR+Ani03vos10nAFa0M4WVkzOGPAWa5eclKVVAADTBa3QtETm7UypKFRlGWc9byILnGi2RHKp/y0GSRi3MWLpYQxHF1b1edGaminroW7P4dGXZhVsXvh16FUZQM86gpGS6Rs+SSLDQlD2KeLFySNRwUWAAhC1bDiYNGlsQ1igp51irU62iStnTRFIVyzTFDmgeh6PMqP8dMbjquyIfty31AbW6JJ/N6IZ2eK60r22b7kbfnZZQUwk/QUg6zOIC0rEvPGSPWwceGg4XFJ1fMQQ+u9iTA3CEW5kiqciDKIUIa4eMaxoBiEgJbGbI2oG4rOysrOuEXxdCXHV9PhVLH7D9tanPLB11SnBsI26J7Id618drW5911emWk9wnhyx+3Nu9oIerHZk0en1MYgPkj9N8Ige8LjxSJPcbP5sQiGBQRYhNUiys7z++9Fmi81JVoxOzbOqFr+oaTs01t7i0VZHmUGD6yz9zQxNUBAsZcAR3so7UAQmemDu4o/Mq6/0aLIsIfiFMXgsmDAu81hS3o9VeAXRXIQi/nLNQkXjqyY5dwTFLII8W/YV58Jm9kCtbnZZcUmv0mEkmLGY2Lc9njex0DZP5CPPma2INkebVo0sBRhKqFfmx0dniWfSChbz2YoF2saEQ0lJci5IbHuuu1UC7JbCEAwmJJ4M7N7WdPm+mFbPjQ9XnhUIuUsscVpjLjC3+rbG5mwGIyEAolp8RYCjRQhQzZOD9CgV1oLvgApCscpO8P3AAY0FBWpqA7HohtL8Qoky5H2F+CKgySMpKgRdgn958+eFgQLINLt4H3s3vt7g81a92L6AtMaiNML6GRbxksy5XatY/nUAorkiJveSEefFg29YSD8YWiVZPzMCmLMq1Z8ZdcNUriYK7Xn5o0OoUu1fIXm5OFWbFA69Hl8oZQ8ZTUUZh8xOhZ/qm+UPxA+zGi5eUslNtrQNT2nqrVy2tJ5DBsZS0Orx0fkwcaznZrbtK1+tsBYZOAwUiYksnA3rNe4sN6PXoLym4eNeMrqj8iR1VGkcMIIH3BXHfyTiEgybGSMiXJmPuMaIple6t7yXb7y6ZcC0DFqgfBVBc/lvUPbbtIGpfNiYmUnXrHQLsT48oQQSX3AB/qg0OeWL8araNdDLqcbuIiJlEvFL7XVspjyl9omLS96qBvQ7eZfyRR7C6FoAWVeNMtj2os98IO9eQVThxc3dE5AO1dncxFXPtyZSh+DtZV1MHn1si6+qHdRsL95JaLB6LS+LQc4Ro+fOyDpKEQaMpbm2LEyMEkVy7SuveVgXZ04/7zVNaobasyvm7J/tpGmDB8Eb9N1vLo2vHJq2ke1Q/untV0QM8ZGdmwlntNxcgNvlSPJ/VePMyVsHqEcgTALwEaZEvWLUiN/CnoTwWDlfndNZkwtBzGMCioefSjoD0ZkdHw5GbcN7ji3Ph4DsVivCM5aCm5jLQKezGIHWL6vpM5xbnCsqZedvDC42bQn6z3XVuJaqGXZRCWH0lXOuPRXMnR+n8p+dTCCaVZkTJd2Wix/2AmeUIuWdjgbsATLuOZGmpue1XtGr18TrgvCpae2isPbNi4dVM/cqKCrR0irhZDr7b/ovdkc8lP03vrYUoehidN4eWi5ZaqdTc90LTqHF/0+B+WTFiZ0XOQcbxo57PCfXkbD8EnT7LzE3tvbRcgqGNq8w6XJ8Bppln3wab8JMbo1Ain1JGjF8cao0eBPc7dnNQIAa4uffJ01EGF2QZTQ6jtbVBb5H9pjhN9YX2vRqikbPfb/UjvbWzUO
5GCGCht3PQTE0ewo4x2IDlzu6IZLwot98aUnAX5nLxcyt1aTNv68rvqsqQzgpZWEQ2Ui1d2nNgC66/3yhdPTFUlucuEvnxwQmbKGqjdkMrfTTwQWhD0oPLbYPINSN29+oCv5EVs+U6uH5krv4M8ZSsN2wu2r5wXwjyvcPn9lfqbO0sGuuU6igZnf5eQgaY9zKn46OFjkA8S3ug6JWHK8S4QzXIeUgSHlbx4HzDMBz2kYnOq7Q8FYAL+GGHtOEGURHQklV8t8KURWkMTTFMB0PFgZpPWJ28U84uziWPey0yLhL74vLK6FJk7GPxmWXNFA2Qdm36xE1m6vWWseYy4kFuIqcLmdT+JbflHnYaS/2CFewq6tD+CPvpF7MLzSKeBam1aHqWpspAH4KbEn7LXAs+VAytzV6bgtqAXrjvTKLpY6VhKp9iDxur/2HDGTgnlX0dEJ5dmF6Yin75TIkxhkFJVBrA//1JZVQdfFN9+UlxRBI3O6rqwrkZW2438vKVh+Agj9M7lnkuM60v6T131H/pkqqQf/nRbab+Mce5g+/EljRXlTfBFsXB+UWUxRECnmGYzf8qu4yO/TqtNT2RkzQtWkAyNooKV//M8GK31YpSiZG2Yl8gD2TMasqpM/6bU78HFiZGLljZWNUKnDuZxBKX8LamI/qDkzaK4aMaAjcE35xk9573j48XI/JVXnw7r934U1ghyGEOzRCFFxVXVRfBgseDn+so6CNiqo6jSjs+6t8gQ/UHhttLeUcahX89C2bAEkeadzR+vHWgYRfR7P/fX9g4xGq0PEjC/qLKiGNYgsVbc92CLWiJTx3Kdp5pns9npTYN9w9Dm1eJcKTK7vgCZxwO3j18+sQ5J2wdmvZ3KS4LeymnvEiDr3krbvw7SYp2uKt10ymIB4XtVo1vqKc4lVvE+4LEUdVHRLEW76FHnahFf9R8bRx2pHIy2yAtqG7wQJKBpH0TJJtzE5UXjKGDcF7Hf7LpswltAs/M+0jCSnppI54UtSvLzXcyshd7ejB6rXnrwsfHVgvsdEgd/JN7g2G/B2khG78wPPWFvO9x0LtwKMdWjoNaLo9x+SYcd5kXYPVAeuUDlXF1yLsjKWxBxocXWm6e9qZVvZNPPQu11oaaUMqsV7pRSYRdFZKaJoy5UOF2Lkg9PmA9yIx7JZAFgUaIPQm6CyPYr0DRU0EVqc16g7aIbxEmj80TSNMlbpH6TxDLqpsryZZ2H7l1ZeCWfUZrfdt2wIy+S+IDMtttm/oIz1/y33j/u1wp/tfA+QIxRCL8j8XfRm1TuXPDe+q3DcafH2b+2Sk2cCSr43AJeR15VrbYSt8UHd65+8896lE4hbHkKq63whlcrnnTFO+JOYVjIiA9eo1GnfiFPL+CGtdjqOkF1i2myXBHlYos9Hail03aIKsMbJnR6JE3N3rb+sOon84s2yi8e43BnX+u+SwxfxeHzkjfsALVY2FugWR7oo0wmO52CoNZ7q/jQE3S14N6nOkC3HYu0J4fVee2QjsNYkHYIenjM60CKKFOJytxAD3ek2DyBwi+8AVVImDrtzQBuJ78MKpG44VmseCXshu9iUfSDD92aMPA7BBzDN3jDdcpPjmvNIpcxejj9fFyrlZixAy/el+gX3qiFHQtb8rQYVIWf+oM58r5opJ+frE/MeG4vfhHKw3dZkzl7avGlgHNH7JiJDhk24jROUQWpioiWiO5VBS2jvNBximRPjbimZrEXMYbPXWKcB5y70oXVXD6gSqopV5n1zT15sAh3J/delioc11hEXWhqFh9erb+islJ2B2/oCiNqLahCvcYVHzR4Q64zUBOJItxKFytVJRqZKxW1PtUhVAU1MC2myVE0q2a3zooOOz6i5oGJn+5dUCLtK7jM9GyaQhpTPZbbwzGF1FFXmbOM/kkme96KCT/nVIpS2XobLuOtuSE9zzcmTkZGJN0dxi7OWfg4xwO98H4v9PDKp0F7qFk1oTm4d78DVcwNpENSBSJCBVKE600pTItdmYNKLQ/I4Rmv2rZAq+AyjSOKhY2sp0Yc44gcddVVoRaWKCjpeVWJIkp113Ifc5pooh4X4u7HNvkzrUPrZIcP5siio7MF0ZE9gkOIpmZzWbNwDDuVb2ekyT9HtNj2ObzCq8HJyoKVEinNRjphFbdyRX03Wpksio50YY36qKHGhpUOfJ03QuDTMUTXsFPR7yjioG5I1+qpTsEUZR3WZReDYymRy0WzVERq2RZIPYtpkst5d+gYquSA7if9kAhYoRXD0nl5kJhwF29flOu3jS4etosNK472oPJ4vcMo6n78bSpbWX80hTdhWXLPtpxw3db1baP8lZdMFsJO3NBzDgJZwvr2nf4njBv1s/7mZYP0P6m2+e7JDlM2yptt9FYSDYCtoetWnfozk+lgV1Xrc+ejSyvl73wyTbByvgcTDBxA9tt815gv+Lfzthdt+48Yx7b60fkH/bbnNH/+/FnU1Nov5IjntOCP58/PsfjDPQW7Coi0nw+PMIEOs+58CqHuxOG7AZeqGf+0P/gAP4k6WVuugn170Mkwy7O/zg1zDiSLG7+Fx+1E3HdmMhjXELVF0769U0slOcEubSTqQDbkQLbnRoGL2cKxynaWHPbRATZp55mW+9vig2Uuy0iyDE/xiwnUviw8CqJhy4jEDzZUkkJ/O/330PfpOCcWDf/xj9O/7qrZvadWSjJVtUV2oQcA5lECGACCExhY4MRLwrEU6ZYt06ZsBQoI97k9TDDMOgoMBYjzfuDLchh3Fk4JaxGww6vvtYgPMDGR0wokK4bSEFudIQIYKnpaUhRg9YabCuPy4IoYsdxr/+YSESbMoc6ALVf+QsUQ2K7RoFfEYlQx2sDCU8enhIUlExYLBi4uMkaXuhnxiRTkKfUB2cU06ctS+nDkMjmzRBJZMVX5uK2a/86EiVTWy8wTqTeybn9kthWsCeEyNOWm8MX0CTC4LH7HNYl85Jzl48ZcZ9tqS49JYprMwKIQI0VydYFkO73V0SSNnPjO5ioBsayuPfEI1PRwNJFR6EGwOIvBhU9bVkJMM9iDIYE1FcOgYsYuZrI9B/wou2mLMfelId6IkXA5yt0AcRVFFXsFAKonwhOWEYa7ugmdEHB0D7gtfT8UAGIwklwBHIkyPCl9ajPAhUTopyxCS+psOObsZDLvDYQKaZJ0YD0+/uu/Kh8tqnI9Px4lXA3Ha5QSbnTsfFmeOqN2O8zO/4v8RUmTxjX/TJepH0HfH4CqljOR9q+OoPa7XrusRUivY+sxjSrpMAoOQmDCS3rQxGEcgBjb+I6oHfbgC0fwweFowXrEYqMKVCtfO7gDMYqTC+CC1YhQ1m6Ar/JgWAuBWoen0sfk8Gm5aoQgEMFgYzk8oQcXXIMPZpEOOo6CgzD4Ko8RIjgjEjnYCIES4k6biA8vrFGJjwhDEOyQoNa7I/anIxMDPQJrNj5LJi5DKHDg9bCyj7ASp2FbOlRMW63nMuGyOYS3jvY4QwjlggJ13S7PKIeIN2oZZtwlvoY4uALGgB0AtgjqDFUkb7sE6IlL1X898ZIq7Esp
AbCfFofXHsQwk3g7++chS7Z8AqlS8Iiw7DtHtWTOgn1P1VGv8xw/G2PhmDYvM3QaGvi3BOlEfE8xvhUON8KYwFUphHASRhJLjBOnNeeiHA4DZ6HGsvBn2aInze+zKwGa4t8KWAIAAAA=) format('woff2'); +} +@font-face{ + font-family: "dm"; + font-weight: normal; + font-style: normal; + unicode-range: U+0080-1AFF; + src: local('☺'), + url(data:font/woff2;charset=utf-8;base64,d09GMk9UVE8AADTEAAwAAAAAfnAAADR0AAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAADYHrfRocGyAcKgZgAIVSATYCJAOFEAQGBYF0ByAbmH0F3BjqYeMAw/KGq0RUj2pRlAvOVOD/SwI3hmhvqPXWMbaxcFVodQ89qEsVRm4InDI6Gs3VnVb38T+eMKtHpJAeluqjdxfY5AW/2Aji7+hAhpH18LRTMohDxurb7wiNfZL7A7/N3v+faEVwiCAtAlNswsSqkzkr7sxFpbE2F4WxLF0avWJR3uYiEj23E/7//948+9z3axuQActQPCviQYu1JhCmVuh5dt77WUKB5zYb0xj3A62tv5pdfs0/mds72PnOmAWKcrQJvdBLbhG19MIayF60FaCoVEvGmoFB5CpGov1mfcv9SVvt8a3BNzNJWdFTvZnl3HKm8v8lojXlwpz//zTV/86fzr1+pPxq3tFP5bU0YVQJFSCH8NdldEEuuzRFyE+cEobADpqDiltbjmznuokcaPoc3vfmjPrz3/8A1Q2fzsM1NRo9YABbvk9KwBmpuqGbkBNum2Pm9NyEBWCXwrUppZRt/3790tnL7UtoVTiUOSvJ/su1t+m06lrTTe4ZF4uQSDyOQWOs4P/fL9Pev8suydKsK9PxiyhshanNf/8n82dmCYILOFmeZAk+vh/gaZdeYJK/nWUiVSA4Hi3DG+daWWP3VLW2sk5UycoKSbLLnApkQEWbzdO7x+hcD47hVB5kI7ZqP8Ml3IABvF8ggEMBQRDmiWzS+LjJkya7fHqROvwx+vO7Bl5bGHjzwK8YsMMNMWmY4PG85fcwjFcbvqARS3+2OwjIJ4193M0Mz0mhw0U8mPnNbv+UMyb9AwjggApsDmRcePEVLIpRimwFJphunqVKrVdjq70OOabRBZ16XXbdbf967KnnXvrgk6/6pUEAwRACQkboiA222CNAipKReOCFFj8CCSeKWEaTzt8UMoFpLKSUCrZRx14Oc5KztNGLmds84g2fGESGXaPWHmGRVfa2waGO9F9Ocob/8XjP8lKXer23utZ7fNinfMYt7vIV3/K/fuxX/ujv6KBcvFhwCcm58KQXKExey8jKwZujdxiflArhllB2Q7/XPvpuKn5q05C2uMlS5JxnPhmyZSrVbNC8Xa8dew8nSJ4uS+5CJSt0qRv19bhXfexHuC+yKhvHmWCOc57nfBa0+HKrrrPxFnvZ+/AEk0870xzzL7byx+e7L7Sc3hpOOMREzBx8tKQoQHI/6V4QnnxeBGVnVAsOAYkIogJzCM9jDiFynjBwiM/l8y5RU6i/akTP4UWA+trG6wXECKBnipp9SRGgfKeK2Rpe7RisSP9gAznDFqJQsOM9P/FQtn6Xjni2dEET4Qp8wz87Wpx8iqOv5WZl6LftqlNrYOwlHCUmSssZa9yiDo5y2ylGYQUy0G6PxN/PC4NNgjWo+uRLOrIfQ+ulcU9RL9FS2U227AfPjpPsm9ONStCVzViZ3jnbqfv+jEpfmzg7ugVTasuE4QKfRtdf4PaNyS17Z/fLpYBqNVcJxVAgWQlOrdwa0zmjRqehNBAhroo9Le0OUW6DjEGXglh0fwZpRX2/JdIWZWrMs75j2uwglFHPw27UP7CcDNT9IQD3gwXPdF2lPsf4GhNFqbzAsOncxUuuGV9epFYYArSFV1zeAouss5toxjtvO7mvbtVixUPtJl6NLdyDnfseE6/EH9+y8l/Xf8dK/9+ktcmgzTSVnGXIjRLUPJuBi2z1Vbxl1yEktjC5bOE0eNhs5Quiz0aa2hdbfqbFCIAVEDdwoL3FsQqcjjCKKEShRM5jAQ4JBRPbdWQiiALEawKM2HnvEQgYROicVPNWpqy3CFYdD8DQNmC0cMpKtU75k1kFUAKZo8fIo3bL65k2bGJS0U32XfVYwx66fxTvKvyr9uWoPXhpjP5CRn2qazK/lv5vhbz/mycWnnTIPi5YmwhVqURjwdlFZss5ASDw90ARyFdggkJYDvthHGyEXbAEamEbLITpMA1WwhzYAVuhHubDXtgO1bAB9lx1J6REsE6ocqAUFrHTtDRYweZgMRTPoexImyDnS1g1oRwOCweK4CgcgiPCsQanOjw+cAY0WnD2qs3gXNBiugA6hbYGXXAZeuEqXIIrJjO4KVxLw13hVod7V30A+oJ+033w9PcOHg08C+rAWlAJ6epgC+TWsVbwEEpnQDeUH9ADXkBFhgqwBio9QA1oAPugKQUSYUAAhQDlIch0zvDIBd4espgUKxMDJ03pqTjtTH1pEB+Z3cwN+aVidvGktC1zynvVtOoLRVUvrD83s5o9TU+zW5BD/7zg38P+fde/j/zz8L9C/+r9n6v/I/1Pzb82/qs9JENXkUXksVB56JzVwP8m/1vVqazT9k6/d8pZ09exX5dN418wd5rLtMRpu6ZdCfcwdjfeZpxtfN/9l2tObaMKqbF///3f877x+KbnG872WkG+oD/ifZv67dFv7/1P9LvI745+d6/zBSahJlUmwxd6pjtM39pfLBQLi4Sv8q+nW/83839/TP96+obpW6a3MLyYFZPAlDOfhPPY+hn/7BI341XXm2YWznzcbfCsHbNOd794dtWcRXOu9yyc+05+/7zB3tfNb1ngsCB5Qd6Cob49FjYu4vcLXtSyeM3iksV/O09eUrd0xdIjy2YuUy57OviG5aUr6BXxKyaHjjI7bGbwmGvesJJambayd4S3KnJVtcVsCy+LZot3mntWl6wRrvFd02m5zFJ7yeDvb156sdW1yy60zrC+fnkvm59tamwmr+hjG2FbbfshYKJdvt3YVfIqS3u1/emrThsGOPzqMHCNaO2utR+Cx607uv7z9fL1565fsqFogz7stY1c+OObbo52c2xxWulU5LzeeXCc2OXq+C9dY914bhvdctweTvRzb/RY4bHNY6zAR9Qm4ox3eHZ4/r/g7aXx6vTmea/3TvZu8nH0+d2n1edD4oO+R33HZ+zyN/K38E/0z/PXpxQHXJu9WNwVSAbGBJ6dax20P/izlnNwZcjykHMLNoTuC5383x/CdDdYS2qk06WHZf+RyWUTN82UB8k7FEKFXHHqFqPwdeG54TdutYj4NeLJbXaRxyP/LuoeFR915vaB2wdvn7zDJrooxiJGHdNG9ubagAczDDvBZNJp1h406izKxGE8J5+2zJSZMIx3db0RQooCVUMDIYHgRr5CrlLJGR
IHlV2PXYT6ht7gPs4gIlUJAYIE/qPmnYsn+35iYrNNFO1AI+PGCRAAX4MDzu6a2x3z2KdfJ49dWtXvI9O3mu58Fj66CdnsGSX9FMpSeNpkIUnZbbhGQMQL16XZeVsYXcYlpniYuG3/F2PvD+YhWuxNORwrGl8N3AThn/uxQEj/uRfzeNJbkYCeXNbaLNM+KC1jerqeVd/tUmtJIz1hg8Ibd8p2ytgdqh1pKuEMXmhfA/b+ZoHcjrLDNui6k6Z63xvuPBabJQsWBh2W5ccwq1adnXr/d6eouuk4DtoaJVonmhhZUTFSOUt+kodSdMf2ZlUVo2ubenbECKL0LtmhUDpEpkmPYcHqRsHIygCLVXKxlZWs+37bqp7PXgj0VCZloQ5vibmiJEGA4jKUGCIyXYEOme/qqjJq6SKBKh3jmPF4Ig9yIQZ5y0OcqdU+Zw7jaLtbCEeNiQy6dq2TQwxmcvMMavzXXS1D7Jq2fle4rmyE6rXknxEjtxCiLWgFQDvTusDW1j9sgZV9efsoI3Yn/Bs+5HfkQ5FVoFff2i2uZrWLq4M/orBlWjXTu/m3DTZtXjvZ3HhvSCPTfcnZdPOshnk0KoauRnpBnbg7hgXX2YI7bc+sWBr9A7JHm2U/PPLTQYbtkcYTTAsOYDaAaeOlmUbyfFVjAyHBwEa+QpGmUjCYGbHfchITtu/djPrNO+qsuuqaCpyvsvHQfCB2YirjOU9aYJvgBAWgYB7yeI6s8Ay3z2fC4OfQNNOhwyCYjgxizTfhAxcjwXuMZAr9LSE1kHZSNj3Y//WZnWsq1xT72TOfEZXFPs0b6Tkh5giM3zTmZeuWdTfeZ8k9erQ1qRNseA4b2jB8b6OqLT9mC95lbj3TeYzI3sr3+1MXpy/eNcUI1lTAHFkSm1KWjszYehMeUUBMQBPTMiKE+9vhuf7u+r7tVqoGaBlWVtsp/ZxYDgpTIYUMJPSCcsg5jGUs5dz/algeu/iGANCDB4EBOziMUcaGLUeMsWGPgcUAZQZ1DlaI0vtyozOFWvHpo01pVUzRYSVx0udnl4cwS3hH5guqTpzNK9W/RrHSuyJkFNUWYtFk68kmn7CpAJpESK+oQvwW62A9fFM2SvQKGt+bamlOXB8Zd3T5fk3vqHGv6vyZCdUXqr+tym75PlYkUsb4suTWNGIbLRxptrO4TkhiZTOUSGvkNCguRQUniULeRHZcUFqGjArjLEjCHluPMacotI/nsua7AII0/iCMTzeGHgVEuUuhhzNJV0K4SZ8UQP6SSFVCnJvE8ySOfX6tBWqi6LprCAIfZUhUgmlhJ5g2duImpa/zjKtqdoPFUBbmUuvcpRyJq+hJeU2r6pZa5wTlzz9fIurmwjvOZF4qspwK4dYeEpuERlB9Yi9BqcryEqsOlG+zMhkhq67du5yfTmoBcXwHPnyddlWmW7e0Th+9qnnMhVt9TG5d9p08P1QVqpRvsDZiw1SYXFijG417RMgHp7AZ2vGZc4rAc7ufYQzSrWjd5s7iig+Qe3jlGekGT+xY7cLP8/V8DXR3s+RBgAmOJw8FPMGR8vEy2FQbx+3HcO4+Img4Ybtxn1Ms1VKuAo9sxefpuH7qlLh08Q9GIX81s33VV3zmpCJQMgQJDCygUd/UjynYzX+I9YB3JwINWxOFbMMN9g6aEIeJky21Kxso1CPENZCJS41LSUo2ksKh1f08WfiFmTjN1LZ15zZ00V2iho2igARFKtO0maj+yiHxgvbiXSXr93to96qXRc31DHFVGNZUzrqA2G3uoxrQ6yga3/Hz+ZXx1eHZLLVdU6c0ONz0udEnncJCtyZvY8oSiLyPzUIPaKg7GYlQprizqeE8KlNT+HzHj6kSWbwmIDD9YPkKG5tCUCpNRPHJ7Uv5GM1fCRLhNml+Vzaldm+GuzdTmKJg+Q2HE2Jiestj3G0ww08USGB2f+hqy6h91LCyqryKod6/2L6sgyFEzXj95RHrfRk/ma9sjhB5RyaDA/B/ruj9hvHqIKZF26a40RTnOLdm1MNK3UyNqny5QqWSK/JVDQ16EoAkrPDDggt+2XiIL8Q2YM4NAhtmMotZ5UHAAgtsZpW19W1hCzbYczqJkA+jP3S5fsF79oWPKv+vVwcgMgLGrEQK0L2wAa3vEDt5qBLuAkiN890DUybp+uqwk2AZVPLPnfPWr3cKncGSzwwYWYMFYtuu9v/8aeOyOUrIvH4OenmGfW36iARtaW1d9TovUcSWlAAFE50cnRwg9GpWdaQw7Sk9ibZC2TZ/l5SY/TlrTKQLoappP2qhAVfddPLt3HqbRtavZFtOAV1epLbqYH1p3HUf/X3MJNNjZLIXCbuQXW/hdMbWRJQRVJQ/R8aMx8wokKqkm5mGzUFF7vQG/zjXOLbbnTj/SznJgOsfR6KLtyMra8dWegYSC3FE36vjX9atLel8zboVVM3ezs+u6k7bULjWmRmbEudBuyXXAN3ToYO2fumPSg91AuIvL1bFWMd/aHy6NFZBR8RWXNy9uvzvhwfwrRSRyREY9eNA8mu4ohVflZuuQA2h99mh86GuQPpmTHydAYqE+nJAh92N1ZeH9g2+0r5jQ3HKftMol8ZM7cM9saarN5QHAKk5M/HapW8tN6VaT88VV22g0bPnPHT3YAvi6Ma2213a2FV85vb3sV1sozvzo6n5uKAWsrbWQNOTkuBSBB7HjAKSZLA6v9D8hWt8g7FVlp7SbiM+wrnJ8i89/m7+8a5LerEGnJykTpApE4+WbTJSPyKkoXq7hR79ra7F05iEpACcWJFwiA1MYnYUSW0iKdXFYBah5MhYHY4oFKtpt8NbOzD9mPvzac0j3LKPZbCPUelFBTyxDsFqxXrx045Fz94Eq7xGdaV46fj8lrvP50GreLVwaw6tp+xJoFExFWW82YPxeEqHPluMS3n+A0ttrCNfYA7mL3rz3gslB3RqvfVDFqMaoW6y8VGXUXu6W50nPccJSRzHsTuZdObCBmPCi8JyTt1/J7++rLT0FRvcRSxITNsuptduKQGGVw0Ly/kYc9sQw7gwpa+aQUEQT37xOsgh//7+lsp185HDdgvmebGwU2dS57Ex1HioFpB8daEmFtIiZeMwFxJQM8dA9LgrnD2BD+QXnaBYxv2knmHBReceYuxjiKXNA7fcyQ8ZFowW+OKFDV/Dotsie+ZrV6desBhGx5FHGjFMFo1wym9CNwQcO1jPmzvT7IPJjsb4mqNFuO31SKy38dTg6Un97gPedTxRdLSXKLzubOw6nkV/4lmsvWauPBXP+57U5rXAX/s5xm0DbZuNR9pMO8vb2rvmtTk6Boc6OXaGXffWfnVHSnmd32D1ePXPnPy9UUayJI+vt3Buu62z2Jp+yMz4cTOtTjpQTEjIFkE+vNigJGm4UNwW21VkFqoskDSS+ldvv5RhDKiOcT2hwhwlUZByUzSrvYXo+tqqtIHSowfbclmqNiOvcLOD7lvfEzcgvzqiMC1eFUOrs/YWrLCU5/H01HwxTZraos8hCH027hkOnRUgQ5A4TnAFu9v4tUWyJF4lyqiovMcEzQIHl
Y2U6qzAj0kTOTT5BLXRxM0zFLUUTDxQFd9WejlrTP8EoT1KNKbsLWBhmgWSDhtS40dJe44cFNJ7dmMWD1X4f0Glu7RfKy4kZwJRQl1569DoYRP1Zt+ZO0KNfvOvWVeQ5DWYYMHZDhvlPJfq1HnZtZbPOj7s2ssSiujKzcfi4jZvjtu2O2vfT3BLWIL3Lz9Q7UYuVzDLSoDsno7ytAUayonTpwO9CKE0ZRlrlFJvYElI0tQXff5m4Apb15uF5F6Hw38MO+4lSEZMkwVEh9ncDBNNBmJuQ80tHMmUOyFtQcJqAMPeZfZSEoY5OrEUYQ5peFCyEmHsAmb4BZgYD6ZriYpJoyhveqLEkJ0lN5usb4Q7XeD4xhR0pvQi35jSgDRCSS9gigPrhau5zFMcneApxuQwVfA/OZODTYZH2TxX+nbIODxWkL5Cs8JsFc4qVDQ/NuKetjRFoDh2BLKOeb1Df2Aprv2rgth9FLxLL7IuhiE5k8BMzSLqvUVWzV280NN3FosqNAt1PmZnpctZRPm9lrZhUUj1SN8Y/eEwZMot5NYYE+Fv4eZNmZQxYrsM+Wkz2v9c442JVvN4Xm60Y8edS0vep/sIo4I+J6BLyVdnW2MBVTbQCBeh+Bt+QaPiRhTjmwoiCo9BxkAG0XeMkYaDOZdxNBmocuBYc/szgb8M+Vqd6E1wseEhhX7lzqayhyKBeB+GN+iXbja+aZcli7GB0JSlPjvOd8UGAvdL15IjxPmh5AiOZnC23trUgYEOW8frYMKpl8FUgfgAIQg9YemBVajvBioO4eCeqOB6XnN8qAfCvJ0jxLSCFxiLldvPWgSIdV24hIumLgqgFaUMO2XhhLJBiHDhQoyA3RiQCoIwdVf2QsQpx+YgSZY+pjcVff5Q2dmIwk7xwwrmGPc4YAn+qTuhTzs6jBfcHuAcjYvzQ77/8kIkKxBsiIjPEZ2scDW4xJicOC+kuXC8l1sQhsAdsD/gPgDSiB6Rbm1zn1zGxI1MyGsTyzI14K4Dbl4OUcY0kD8H+cMx7WZ7Wf8QpM0Cft6IKZMI+Qbx4RmeUOhtd5dORJRBBHpziNMu3CS9KW54I7lXv3GG8EJ+BtvSOJCx6jIULbXkiMdpevh9JX6LXRiINRTU9Yx8STc8SoatfTH+vybBKMIX1W5DF9Gnoz+XHs++1F1QnDPJEK6k/5NpHVUwN9tHjYmmTApXshKxoNdCD6d+T1nhoSpgETRDJU93T1+42DMIiR7pK/1LWQlyGx8JrMNWR0lehy0OHqkgIs5DA7dzA/c6EWhPT1k4ikKj0wFBfi5dQScYnwo2iMh9NcEBZeMA+oj7FPokdujMB6YecIdaLjGh7N8SXjsMjYI245a1/pMQRhxkFC5JZqnuqITL/z0EDiFRMIx8OzT0M4YKeRAfxmf35Oq5GBrlFkVQMGMDIk8zXJgEwjOdLmgfNajfFIBt+TXgPo4ZQiXCZzP6wI0KZ7+ht9i7nQ4IxgR81jexc44K6NBek9sfO4/Rdmg34TtDeOzHPXXQUyLtwykZLfI/x0lllxsTYYpE7cHeUD7FYLdCt4Ty0wcd0Ech+TL3tWHOcUd4huEEcYNAdskWX4Y1zFeS2jZltevblNv2+OfP61XyjrrOLW0+xMCOgJWd9uK6Evb057UPrVEwruhjNgv7W6B9TBfEERcnn5d1/Pm9TGq70t1nISPZ2MID7PPrau4TuGRjQEn7IktypsH5kIehsc0thcbyN7iuDeukK31rmforRIshIc+M9xTK5y+xQDmHak5nsx+IkK8fxVX4xBOcaneXwjfdSlc6ydzUbONPhs1CZg2v9M59aXFCw0yl2dJNGvyq1V/PP7jzim6sYUu7XinhPxxobBsjk33Ga7uVOYpLR+2Mtidc1MKB8IVfKwxYgMVxMLAP+1yMAbwk9vw7ASS+3crQTvFOn2Ymx7/N/mktQP8SGWLd6JlSIMCnhCV9bfb/egQvbaZ27zpyh2T8uglvZsXy+p1Jh9LaeqMak7BLDcz6mv/XSyCoy5X7LZd3bJ+7N/krb/l1FifET4fNvJBwhMBHfunATgRlEZ0uFV99bBfJICLiDRFidx74R8BDW+/gkXvZdfla05di5uPumMEstN389IYen9wYG5OTWWVxFS7ZsUeijkZgdjC3M5nsu5p7GE4rt3BAm2MY3jk2y+RdNS1bXfSF6m/b3cgl99azjeDYFWfcn+UMwePocUOwpPICYhMS3Wlnl8p7D7GUlON7x+oXj+EpB26CNlOM1UHH0Suqs3U4VwNjgs7qMrzDW8vUBiTE7Ac8mbyjs1PWrmN4cszGw6QAPE9bBGIQ1R2sWMNevZHO0Rx3csiBp7frDnV3HYEjJDXZ0z9qhrKMZROt57KkLk1ZELXY27SZEuNBaYsuRR5fDXXvSw/9/h8NejLyg5QTKh7RxgYro/Ba2dBzJkGJvK3pTNHlfHictlyABEQOej80ghPGmmGKZ1gvlV5PICrrBePE1YuZzfxrFs1eHVhy8UGGTA7ruIy1qwPdHfYuNBHGnRr6FxRQnLF0Qkz2wbs3dDNvCO9lHD4r2Lk34jnagwqMpxdbr0ExO062WrZUNA4pRBW/HfNCHz7GVaQs7Ff2mYFXCAx8VfikFqLG+0qQKMjkPV1SlJdh/LIdZzLuWk6q22gr+36DxCuFy+XRZojNPnUghv+G8EvJukNxOMOxIQ11jBpofxAoM8ExRj49rMPVUxnGnTrIFZyV21wG5wn/2dcdzjGMf/SoP1+TJHg2HUYqcuZ/HPpbTx8bVHnYKAfsFg9AEaU6Hy56/tNEbCjvXvvN8xYJ4TrPJWiU7sjBz05v2/mJHarY7/5wtI6RnGlqgp7HCtK5+0XG3VTPQa8W90oqDA4XC6hd+JeSrh0YJfgtugtrbI5qXzlfci0W5703areHnxAT5yxE9FGGWxE3FMvuR2T3kRWFlDimSioMt+1kDPqnGVslmJdqeui+UtfZlREllltjgXGiiw4x+KoeYhQccSBU7CkmQ7GD1SrAxBOGRXQZg87HopkHnZCfsnRTxBpo66dLOcOJHY3BHJ7gW3wuUKDvO2A6thcQpWfAHTMAyiBc0JdJju4jFy8TZLGxYaRSdS8DbGVum8NCiv5DvqDajd94D1JHYWpCNkqsdmTFG7G4tc471E6st2fFyoVmX5L2/SZEYElEw1ZGw0GGJv2KPELkIaL5IpcRPv43HbVRXfImDelOhfCgZHyvT81zzjlvvYV5AFhg7j+2iF2UTRRorPysaSSNBRwykbsXXGC7ds5xOc4+/Zo4dD9ijV3yUcNEDuY1nNnAUoy4jbXkexrFfIIyFvbSaJ39OCtNft82ku8doq7ou4vIA9enIrTGN2eSsDP/dTvtrLoDquHiynPWH5zIRJNSZNg3T3DhcqRn4HfTN8IicU73CxoOmndDEMt+pDPmJJiiffeacB0VenfymZHf4/IN0rrhAA1TSpyF/t4Prtc9JU27AO9cGJolfWm+SdOpGWUSY4nBfnymFiWUhuOhF+RHkQ7mCbafDECa9qNDCR+pr9Fbxhc3uTXJ
8T9vdurfKuxmFBDatw2FDrr9WdjHuGvx4JLs5Ri8dMmGjtcCfSkugU0x0kaa4XFSsHtRvJDgmiQwVAyRc9QJBzMjzryzcw9+ftLi5KdIbZl8Y/K1eZqu0EI2r3xxFt+29BxlZ6Cgma+mgU1mz09dVpi8Pf7H7cxHS6OAqivg2fki22gPH6bbnbi0fcuibvD6xhUMXgn/9cfSdyTTOJWJk5Hg3Y+kMt9A0OJ8fEvgcv74UJt4R9rRp+K896VefGA8g7IT7pK0TdFdLCBZ4hHH8gTyUAgNkSa8yVUNYWqkbgpv/qTHwhDH0ujw8JJOsYw84ci4A1t1lQmlkavFksa+HRI/xmqOE3f05aVEnhtXpmZcszzjJMHd7ibqzGDGrSBLC1leXSGyeEjHZaybebFE0vbNeQcMkDeA8xW7MIe4OtM33IpGmHYTXCjKHwFSPbc6nzCyrnLk3vmXmfCl4n8RCjiEXXf2BfUBlkRZIxIarEybnUEZoep1AEahzB9v+vniNm/2JWeiq59aTM/fOHBmlcepxOufYoWCBvebQciJ7D9kBsbG7n0lAPLbax6Az3/TMgnR170/1QQ26hmDlFZqA2zC2/lV/+shYQ0mEJ161wVxA/SAckBSTFd4A3hchOYpC3irTNEdzCCHzeRjo3mn6nbzntZXY6Drq8sV2nsCNKWIwxMtpEUai9amnkof48r2jNLtjubxwFqy20+gTfNqYS/qKvIHe/kdYfFQvHnMeG4D8yVVGQcv6HS9bXRKenEuEljMLdkqKQCH+n8zeJ5hqX+zLmBnVhuVGF+GhXq00CNHyCC2i+OPAf15ThcrfbEtqT8PaVKp97cjW/Xdks6ZAS24t+pG5S6E1qTg2p5fzvzAUvF2QYO3Bm5N3OAs3VxnhTLKg17qcv8PD/1DuEfyyAPMhk0MwTwzL2Xbbn4P22gPjkTxMa7yjOWaMWtutTkIYWw58iKOi5hQjMcJI0zS0ZcQYZi25+LWw3tI5y7UCBCt5j65Y5/dG+ADqBNeCqgjmqcPOh0DjoWm795kKTUJmiQjnzRlbDD980+YfYYynYnfP1PcSsHwxubfS9mcGqVP4aNeFxlkH8CCrz6QBvCeXOWK0WMxOAEjr4O4Jz+SLuKVMqeDqxS75PXRG7RJLzV3QGgorvLLZUz+o7SyoeFcgrXy9qazj13JhxscPMchl/sneUpZOBj/ewznwo8prJV7ll5eeSv/aTwsxaVnH2DdzE3hfjvn9BgmLeTiLbk1zx6qbz/U5ntlGyW/B7Dpt7wvuj7WbYkmim8JyfGnfbQFesDhPErv+qFzIOqebeCKnw4TSxnMCe4TVusNTum0Q7bzhO7aWG+KPlywzIzaXdiFI4TY0vjV2YwNetfcMgK989y3zgs3GMsbh9w6Ma6HHYZuYp016p2P7ondG0kP7fl6IGY9JdZtPVMfHdbLq19yeaqBZ+cKyhRHPNcLB/hkmZq2+8A2A9Uxq38k1dxUwDf/a6IXObr6QJtNnPUzKPUEJaQUGvMQL4JYo81ciIHGUafLEEEoYWeLTgwNRhDG6Ioc83S5483qUaRnIoLs4qcfe7gHNOAe2wN0iYc4Y92omdlMNmkK0Y9PBt8cvgcJeoWVxHOOajik/MRbzLofngcBmkNBf70Qruj2aZiWA5nix6aShiseJAWgMm5qBl7EJzR1SX/dt3qorxCULUMChf3n8+2gpRSetNoKjn4bQaWEtgUcbMee4YDwbG/+VAxx39BPwrpyMOdy1rQZM1mxfge7nfsKizNoanRgeBNpbl4YlzMJpWUETc8BVpVD6Q/H6+zv1gJUP+NrtZzGACxu6g1UPLIgnats+K26AGxKMoyvK7xW7prraCNOOD10Ady2V5SeDEEqQ5pH8e5fUpWr9xJ8XFYV+pfY0Og2aBZGelCOQrpVd9OmWhbXyqlJ4kRLevbaF+CDvbP1hNBtQT4KSARN1RJvD7kL1R60+p5ezXg98k3V3CTiB+RN52omQjGXbsOrpIC3RIF8mVLyA1fntJJ90EVcuzIJMCVrnNDs8pI8YMxUoD1GQ4gw3weOkJzNx6c0OocD7qNxcQeLEQ7NGUseImK9vDkBMRLtrrREhK0nACAbUxtAA7d2RbLjmdiuLY4atbbGksZoprGWt3tA0avMYVq7G/m23jbBrANHpuHRrOm/PpUBKIzhS62jcF1b0bmVdtQtacwgpMjR4uj5qzaVdA3XlV3k2/Tqtvfr6nhkLxCPWyOX+Aa+Fq5w5lGTW2/yHpNvrS6D56E7ex9jHBy9A1w29W6/xbGWxQjpHesS40TP2qQ782N3c4fHUjjpgT59wyF1ybRSJVhqwLowAbfXYp3b+uwl9JMsnD5KFKz+7D2Xww/r1anCshhSQr06j0mpTlrrd/hGSihRQkaLaKEsiliuN2RhCgyT/Zbw/SX0FWkBFDLGpVDm8TlCHEo39IgenUMb7eCAn/kpJBcVZ4zCGH50lbu1ffy8xPov/b420sNvr2b5KPvYkASGmsRu6Y7F3yH21XZdntu22VQCDqZ7Bt3D3UlwsodAYZwfuBwLuV9ClcOMjaeCUpXjxSmkagVceFVKsiwhMMP+rGTcf4nkn+f8H4SafLog3b74O0TkCT42VNQ2pVDaS0rSCNTwOo8jAJvmneZ4sFRTPknobaNY+3DEqo0D63PGD224oDHbBs/2xqEFI3Ax3/sm8nwG0pqmMZD9zBQMgVfCi+CwGC6GAGZyv8RbOhKrVAYBCrax/9U12QiJFIdES12zXF5DOfLFTG2/r80WCGY3Te1WmCNLPmeqeb0DO8F3cAUnUePuPsG38RyHi5Yv/trUbhX/obayUwBxbuFSvj49/DGz296SIh4O2OJvY9SJcuWnEp8LJYJ2pskpMulale1YSfl7NoO59eDSjyNK0wMIrjTohjgfo0TjiFY/0/GjCyCeFnyQFiaoltMpnih/xZX9EOVUtNUfC+dR8AropgtvdYEjaD0EuLi668jXsYTfAFxHmktgcxE6d5Uf9NPzkjDzn7bXqU0Kcu7/eLeV6nNIEJe7uapSrXawgU29BGS1Wo9+nIfsglHRer40L8Ybal5i0dGSRhIqNT/qkWgII4SKjE+jGJNSCK9eE/DI8z5iMYjSy6w6eWyQ51JZgjHEYCsWEssiMYYr5EBcXz3oIJKSn8CrVoqe9B2hFtSRCvJxpRSHnagcoYm79lnkcUvEwoUslgrtyzyQuT+zNMzkzKCKgTcLX4mVU5/KDYuVHLwQXYr3CzMLtxcYHd9TswPkaNVCdzFcLGBxXPAYzFLq62mJoQjNYhXCgqwHu7LFxssU4YGs+eTcyuv535Sd75Wx2lbny3GqnSpaYvazvYLNPhAWSCAWTuCZD1HDpvD+j+oXDVMnkKCDgYFKPLlBNOUIc7n+kqb+CLdQ9AwVkJ/QS7xJJHF9XbUGxq+/5EQ+FJUFfvkApycGqE4Mw9xjJSPuZLidFIDq7B14gXPdWRZ+zVJSdPjK/VQ62lhT/lHgel5brvbqYc9sdty8yzda1C+lUdx3JTKYeVIP5UPsu7mbb1otXLR
JjLjLvIp7phhPMUF5utQ/vvUjD07naoDbD7p+PbrJgnUGhzM2/kOGup8ILG6jey4vvV+7Sh3Yzpza4lTfaF+1lEbecMd55jdmgf+d7LPHyzdHCL0dg2wdRbVXpzPeXoTN+aab3vAgONh49umUVlR9w4KVnkROXtSQ0QPEFkXyksr0k91u6e0Ar5MCXnWU2oBdjZaonMCzbCFUPfdGheQs4c7hAjQFnYJ07QrjVhFw5c6hZ0x1MslWv/QmDRmpNZP1a+8JsQho86eiyacn8vKTpr2EmPdJNsbp/SCHmQ907b5J93tbxqPKLS1jCxSUdBnJNtTvijk/MiFaUoLNwet23PpXbxeG3RO/p4mQJ6fQ3LMl6l6mkTqo6SeYeZRHUidHc8+e5z7Qb+U+2OMS/vCnmYY4nSO75tK4fLXquL/LPX2NGOZj8poyAJrLd0FtF3PWKmxQY+dJeGRqrEncznZP1gDPqKaP0nM1DtI9+hncWA1N/CpdJPkchzZ1Sa7t2NjaSQU0YoC0O4nPLJK1oQCffaxLX4TS5mSTNg2ijFnaMvZom7DBpXJ9VL6vgT7mQwPH1diU+meguOIQqESYNH4TZnuoFPLjz28funCrTkuV+0Cr3kRULeatNLDWY0JsRKw6cwH8JR92du1E/oqZRs3CTdIiYTXHQl50NY91j+PTm1EWE4HfI4PFfXkjeUn7ZBt39Mm7DWRPwlPoog0Pl6wVbVXaStlb9SPivHU4itxVa+FWemnhcO0KizuW8pXDKJ+HUegQ+W5q65PvINzjs5WW4bPvvB8GYZnL7+JJmI55LMTJZ5i0VNpMiSE0pJrE/CtNMcvWb4t1B2FdkOmwnSa4nC3i4Orpa9dF6udXLmDANgepaXYUiMYBp8Q+gGPYfybscyWQBAfxNge25kmiTbSlOYavztcSz7Wvkw5cE3BP8osg2AHEf+CoD4HjWk/N+xfyq2KGIESAcDiYiFmNw+NnhRAJreWs/ZncaTauoe8sTeZ+CyHKxibOJf1+qbasPr+2Y9WriOn/iThSaFe/np67yA65r7FTuyVK8ii/M5fnlpIsUybmnEfIxU3Sv4JPzLHgeYDccgmMjBUVzSO2gYhwhG+0eFUsuMmAqCjtvdIWSpVpo8cRYhgKM0WAudBwEZjMPCX3XAEBVJ12NSHcMq0eEKQ3FgSnaPJGhHhVmiLhjfaJdQsw3oEYlG9deJfkCH7p7LhXAASMc3CcPM9Ck/Siggwsb72WQiJqibqpK+NTXvKl53k3z/rBjDXPlF/lc0OeWEeEUK7m6NyruRo3KpTvapDEQngGnA0edDx/qKlp+I+OgXdz+pG6emJgFwVmJX2t4YHXW4e4ECVDXX2jUGoWSilW19MyNUE1QnN1qasPCoba6ljcRzQZlCtdfranC8Qm2/EMIVHZSNRLycFbdJa1krmvv7jyBK8+vfzpata+mpiSEpwSRnv6Nes8eENZzcoGlKS5BIWHO8OEwXb8UBUKU5mIsDqeYK3hOuld0NhrgpAU91Y48+c4IYg6kWL2ntgpp/fswSw+qt+nMZ37rzXn8mORpPx5/9vZn46aqNK9EXlu/O8/4mP2oVFEM/WlOSLUrgrTjikaquR6fBXilKGMYRvK9nNj33GEUkJ7zUlTUGMOeJp36ycmOaUB9KalwWE6ac8go8xrzpl9MCRFuIOT/TW3TU1Ze9HvPq2QUquxKkrt84wUPsNfDFbEprzgOEKC7mA25zSKopETalokPiRZFqGpGabTBSxhlM0LWRpAgfGcd9UYz4+n2N69WACUYamKKKImogvapguwLyuagUbN7LGEuKAeDHs6oHT7Exx5p2/5eKu2O8h3a+JDUuSRsk1d67eEENVU9Nzmjt01F1Q8m0NhgIk24pOgpPKm0j/mP4wsDKRF7h3nY2edn/stHfBi+ZibxeuM23PtWJEKyZ0Rn3Hb/dbtFY2/L1wrojm/SsSKLwiHHTAy9ZJPBp2knEtXKy1cuAGTcudhW1phldQncBYfHgSxxkcBZx6SXExw0VNtcX2SpWN8cyYG+YUhzSFYDConyF5PpAvfk/Y5CuTvYE5Lb1AfHmPp7rSm8+8oiUfG6FKVTUFkMkbHYh2FyNvjcflhlkD/HKMMS5vSL9JpfObALOeyaAJizIkG6BTiiRM//ChjULxANpdgoRcKPyxhR2qk1B6sDankmjNp6RP2Trx4vcgsJJdLpaxjxQbRU7/W7PWYyoeEem13x6LDatdohKuBBjAO7kHFcM0L4hMc9HxPhIYaQ+IfXJsM2IHcGGUZ/2akM9Kbojj0gQsdR9fERtSxSkN6Pba8IOkpKih6a1gO78JUbLghDHB1S/QWzoV00YfPFbK5prUsh8DlkcieMimZcepcvhqGkjyXzD7yXpRQ4U+893T1MmXjUbHgSuWSkyEk6XadredfKPQrBcf8gLFCPe/tN224fqksSA5IXMOfHdR6/AJkwG9ilOheBqb7ZKqZRKj3lpRpqTDnTvFYC1Ai4OPzjO4NLR2CveOnDkb2fXu00KpsjdNhF08gu7GEoFawqaknlqaXu0m5lIJXcbIfnSkca7T0jKyNcjOWLndA6TIH+EfRJRjIPvIX3JHlr0KmNgphUybjxdO8m9bhDQW8yzgeB6u57NNmwsnu1mNulj7LD19yLY7UOgx8qV5OSwLkVX9beKDztfaUe8LDIGKKm5zwnOvWCTJgNIzpss4eAn4hI2AJ11/9SvsbnqXMh3L5RWhNPh9hSFaWQ5PwxWryjbqjuXEudvTEoJRG67jYXbmhvKqBRh9UsB0A8JWbEACgwcGCIkOKiIFUIMx9bhTgE0BCT2MTr2q4P4rph8PDXCqngI4bY8LZjElgB8lAhzj0KBuEt2+D0dDrAkgfF6RBHiXrqlQzRjRHaGVavbHn3yLYESAbBJUi1/IJ6YNFcLMFtg8Am9N749Dm8z+TAR74TPY5o0W3a175nDt+jOYfipjKdBZTxm0+Mqj70Sc85bme52mv5y7eQ7goo42Zgs5KhJVDQEJKTrdev56vJoRDQtYFNz32mlXP/yW+4Zs/957fz2u9V2/gTE5w8lOfx2mv+y5eH/CjjpmctxX4DOx8IlIyChwcTb+UydI0s4+pZ6qZ1fZiu63I1JRFjC9w98q//h/Lvc9l/TUrF5cbmVPllDlZjpnDB2eDI8Hh3IZGAfD7Wd689rfp905AADYyFcA0AAsA7AaIXIAYNSAzO5hM+FtDVxkSrl8vKEBFgkQDOjDACvEnQNayMdlg5adxxptgokkmG+OoueaZb4GFFuEyWWyJpZZZzh7PCisVK1GqDJ+DSlWq1dhoE4Fam22x1Tbb7bRDnXoNdsXGVrvtsdc+B+x30CGHw8MRxxx3wkmnNGqKhY1OO+uMc867oFlLdIQGrdq069AZE2t16dbTCDjqNTYGYgU3IyFEhAC33HbHXff0RUZwIPSv+x7o98hDIuLs4eljwVAQKhJCKGGEI8cJBUqECODjgBgREqTolSIVAgAIwK/94kGJld9bieGOCgAePF0X9Kt+u1qGc39UJ9NNAWCA5soaWF
emkdZsY0Uuz5IVElHnLVqlhKgQKZmBQ3ylKBMoT7lwp6VJ1yYsKnwynZKCUpbNckwnoDdWIS2RNHvUqYsKd3PSJIdCzFLn2U67CHuY3Y511vqjTzJwDu7pD/KJFM4a0Jp+cVshOZTT5KfVhVA5PVtpF+54sBaQlPdnLy8DACkPLA0hhHagVT5qm90VqKPZC5VeyFePm6oeVIExyF5lKj1mUEDLzz5UHM1o5qAsFYKt+NZCGYJ+yjGn5cTdCD5wylYKe7uztyK/rkDdGHFFCs0Ne7Kq8qLx7acVVK9+R7hQKRByWP6/uoyNqTwhbSwh4h/s7CQHjWREdBAMukW8ON/btnN16P3hkRnh2CqyI87Ix4GG8+eH2y1dep0FUOIvW4C/2cMOSIQAiJEwDBAcmfLNA5cjgIN5l6OAwqLLMQBYEhkH1xAP9m42As7ahZhsirmmGavIGDOI7NM9enDjTndYJqTCRHLN5UCjMfW2Cb1vd/ytPOPNiIfNNImaIQhBiKnapttYCmymwDKLAvOpJbO4VjJ73+zNx5psEhF3m1vngwEMb1gFHgYAAAA=) format('woff2'); +} +@font-face{ + font-family: "dm"; + font-weight: normal; + font-style: normal; + unicode-range: U+1B00-218F; + src: local('☺'), + url(data:font/woff2;charset=utf-8;base64,d09GMk9UVE8AAAqQAAsAAAAAEawAAApFAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAADZtbGyAcKgZgAIEsATYCJANCBAYFgXQHIBvmEFGUT1YN4GeCDe6gf6DUKOr9mkaNlN1x2LcZlFdVJ1lAuV4QOsBfGD+AgypceqID4f5/2rT/fD4DNdJm/lSgZ6auaWzXU7GIOyFSswztEhWSWkzt0Mi63sc+0kIubqKaQBrusbvV3x8r/W/jOqzHCyga8Ij2b7VddYFEpUcnSGGLnh1IgS2e8p4ew6mMKfhuV0sUsvt2QMAxIITI+WVXlIZXVlSav3sRDjij3099/DSQxE9D5qcR+1OV+yMnyfLr0WtrK2BHFyaQFBn6k6UFiB1W4nI3hX7skrnQH05qjfyPLzaRJkRYXEFTx8jCzsk9ZHjlpkj//MLsJ9c1bcwrzv+kYEFilrAxO7e+Lj+xKdv4PRIU8xPlyoqZXyiVfvXJ1drCSuTiEJ5mlJtm/Ggf/tJv+N67xlFluaW/lID+DxlQ4ILaP0AHs+bQkkyZxrgAIYOAzhqukDcSRfWoGDWiBr2qUIeapUhpyEWGgGUQL5YMi4AeCBjEazCAS0SR+JKjjD2zh3nAqrLO7ACnz0VxLdyskLpQlrC6iJDIY9FpMRmx1WIC8asWvHZ63fl1o+vXCdlF1fpFTv2viPiVW4/EFNabPNxjT+N1OhC2SHjv/xOcIVr57Z9/K49gKXY+tNuDCd0AFvL7Gqd+AD0RpOF1sAfrSdEanF9FGi2E1AnOL8iVi9ohk1Kw+XE5tNr+ERHOhj6IoJE9AdqPaiMsUWowQEAIVcURmqAADawBu8lcclWDPDRAeUvByAKxXM7K4piq+KwX9ysfZ99Pu52Y3Js/J0+1zNgaLjHBnDtXHt9lbz+9KRCqciM0QQa78UbQINtBhtIoMrctWpHbDf9NGtz2LIlNfpIiKHkQkeUvfviQjX3AuN2XBse7PZPG30i+KyDbbxuFfvDwpkCIz0AeB6pgExdu4+1UUyiv8vlXsLldVBWucHUNC3eVymMS79ASE0+De3FDMfcCgdARvBs0pnYmx+EPEvWP399M20AfRnAjp7FNSpYjEXpCVUXJ6E094rXRK/UgxFpQFLci1jLkJA09S7RtajF6kZAV16bKxY3LmlDxNVmO1qFRijQCj5N8shx74F3nYJc66msUBQeW+fhIdf8yXG+nqIfdKIqCXfj6ZIj9vga/AHwArqseBHlxa4T61HwhhIAhB6IU4KYep64X+/T3FRcX8/gUGxp6+xjSwhjYxJl33OFilf8HQU4T5aIIr65SKKqPKs3T2GSz3Rm0cXLhzW641MZ5NQX7J/KUDwYuqlJ9TKn98Q/9bigAfuSpDVYPUksDQ8vLhj1aWkbG57T6ry4xT7spC8mLkeGc/LaO3JDLlyXBlkwscgG3Awyop8vrFI4jUw6YwG0ueh80qU+V/tXhDYqGgupJpXvX1ouGH+ia6XRoz2uPGgtKuPk7wPNxarRnfqljyWzxgzcX+9xNt07qFbgo7MPt/V0+1RMIv0RHOQXoZy76Aj2g1N14tdoWdSMeTxijhGsTv06ARMmZ/oGbD8+oPgKfK/huL+wQAwn85Z+ymUOpPMPbn5rriU8ZD8P6MAZCiLgJHr5DxGbFZueI0tk0WRoNMQE3M5qAHbmDn38lHu2yvpzNfP+S923gvrkdYizeizfga/6MJfHAgNdCxMawMdEiqSxSFkU7EEL4Dm1h2zVBNPj3IDgMkvXPVbmUBXDUWwbRlr+18WCFP491ee3EFLi/B14Z5JNSdPUC3++c6wchdsGNLnM+ywJhTBVUly/Br9g8Ag4OsRGasLsTDjcGNYCoIbSJnNsBv89QVl3u88ruirZa/xLTHJqcba/Iyi+rEjXbl140cPK1dHnuWR1Fk8od5hJfL2cRzoH1FCqEdy4F6pd4lfcIPaTiHFrji5NScraRX+7AZZ3wqxlfGuIrsbojQNfUY1SXtDjAX+TnLTUz9c8qKigq6Miin8IZipxtL38lCQwMC3aSMk9lPIfMkBcvRa9fK6ro1K/9RQ38p7G4Evg1miDrsewrdSlW5UgSQ6XcZYZcdVJalWOe3xGtvobiKPcXZdIB8fJwaW+1X4lbKkO+lVdk5BeVinoulF10sI8OjqFfB/ByU18mlImrKu44u/uHXGNC3fIJclU+XBzq6Ox329JaklQQzCTghT0jEv7EhymC+ZBOR5OrTtlgSJFjdrI0jW7qyTr4X8RcTX9d4SPvqq4qfHco2eCtzLv7cR1+pebG2xYmTDoueIRER99mUr7jgVYd9iUisQGvjchIeZVWIm56GWR2zSvUzDMwKT+I8ffj+VVUR9eK8VrsTJX7susFD19QxzIbky/hLkW+PRRzIKpeUzkArqQqUhUV/1PwVcU76ldZUA7gwY+WREkiJQIjg+ufHBINEBMFvV/9TecoshUZeYJ6Pr6pnj5EaokuaR/3KvdPcVs6V5DGChcObH5DDMCJ3IFMZc7OwrNVOwVviHYwaQFjcBVpEVg7FDOueA/t8WfwUtSfgrOEdLe/tttugQfsCQUGtHGvc23Axo3YhIZS1QHqTBefCTfPYLvN9iMLWaRmQVuzfgXtXiHLYdvmN0TN3y2wXfF3zs5mvD0pYjKo1q5FpIYUfx7XEPTDFm7MlvFjPgjWv4Z1S80o+FxEltea1Wqyppv/6lxaWG0neXvHf31GdYiCJ8DYz+JIrMXjXOUrlTpWiSOmXqeq0cNMQdvPB1dN77+PfOgEmChDlKv4fIED
5itcsIQx6t4nYe+bnBXoIbMJ/I7ftZy/lLgggMsXS0tdJT5j5Z+JMtj02AwaLtjVjAm5n1wUG6ddTkpsYrzol4r64eb2YKfXdFsLr8/3crm2GG8w0McMI5FJJKK9l1yhwCVkEdItQrTDgtkfaCeOVqYpA1egdUUTdqyQfrADdClShnVX+N3D1AVWiVk+aTcCtSu4FlrhjxX1BXWhkR5fGJOuepPxQQaWZBDYJY4fk46WWLDIEOAANoXVUOyG+CmJG1NTU4d+7ZaUPLU4OTs1PbdZUuqXkaIWIDSrBAJAHJp4xC8gLA6i4ABAICBoGGMDRpjhSZdl11VAFQmIzTH7yRSEdEusJHMdQMrgZDNsSoBwogtAx8VxftoZ7Lb/AGjxiJgXMJvrMy6sJJfGyWYqYNmKOZfU6hyW3Hiy1sNJ8/1UkapTnupQA2pFXagf3Ux3a+toG2hnpHYKflSBqlJNSjHgTN5ESH2rzz7wHGExQwvPCp5+WlK6QMATDlSl1aJJgzq1EPYKPBBEJr3tNbktU8Ptm4Dl9ruARy1nzulTdR85Jlj9/0PsMHeTl4DZv1OxZ/4DxsSjEHKCVAMCsTHCzCGGuVxqSeIn3uIgtpcwufJzhNN0FK/BU6XG2Pk4Mq9PkiAaooACRJCCBeHEALTAsikBhxZTBgw6TFkAXaU5nI8Q1MO4wusc3VelKs1qFCtUpA61A5QNK9acXmEJ77pROVg3KqryKFaWpy3zF7lK1dnT1qtggfKuMSLXVrm1Fna+Dvnshsotj4WETnECOyXUbLFKFShrrlWmwQI9CVbDRgEAAA==) format('woff2'); +} +@font-face{ + font-family: "dm"; + font-weight: normal; + font-style: normal; + unicode-range: U+2190-21FF; + src: local('☺'), + url(data:font/woff2;charset=utf-8;base64,d09GMk9UVE8AAAUAAAsAAAAAB0AAAAS1AAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAADYgTGyAcKgZgADQBNgIkAxgEBgWBdAcgG3wGEZWcfyE+EtMN9vKA5Bc0yUoz5//3zdWt7i7MMhk89Jh5YNQyuHYEC0QOBNNZU+uBFf/fX3vNFsIYcE02IK5Pv98Efw+vn0T3zFEboYh4FE8cSTQSMomQ8Ug0TyqZBNtCn6ltmWJarrMX4mswKBIYY7H7bxbmnSgqLFpx8yIZ/cA9xgIYCyNgLJKMxbKRoIjYCNlyHxrXrZvx+abPHxHcKkZSgdCXUgHlHqe3oz//PyeO3o9FFGNhscnzV2zcsf9YxvXRZ1+7WVpaVOU31l5RbOUBkheyMMmlLEZKVK6tsKIwZ+2aLRsVZRNls3HL+pRnzcrVfUmk4lcW376y5PCf7C3gZLvzqq35t/M+nUD5ywykaFPZH9GRJYNONVRVTftcgLGWUIRLYJCwSaGn8CfbxwbYj5JJypDaol6kJ31kizO5PqQxs+bUnM4k56Br0Gkht7C9/jE3vSeO0BJupOuTZ4LoOC298SmP8tF9cfSWHk8L6T4zLaXjfPKCbpwNpiXiyMevCxu5WT2DTtfw9clOp6Y504RNeHjUi8YuUazT0k/jTC76zsgym27QcbGUT545YKQHiSNiCTe5Xk8nN9n4YN9gb19Sn6a9FuFRvaI6Ko0/qhi9pcv0xqB5ibQ4a3Jf7+BgXxrZyJ3+ulgijnDj7IHJC8FiKR0/dYNH/UqVVt1FR+JMrtdjiDg0F2uofsDBPCq2p+4WkcQ4/Rc0CYQ8BEJHaTsf6B/qSenV+i0mVZygWm+cwgtQpJC4kChANpOaHAGRNzFs+v1rfGz+9AbNpvl8qi5YSM0pQrJV3ORaf29oWT1DA/2Ila5ZzAIHJrSkNB8rKfm9uOm9AUf9kDPZ6dI0R1rUF8YxRkHGSZlyab65jFykC/2tt8U39C3Zy7vHxdiD3N3scyc7HF63O6211dvU1OhvqE96WIxzBzb42zglGy8bF8z+Hn93V1Knt7OlzRJwO/wO1v8WY2Xde8n+gPhGfPvWRdJJLxOucWcdzTC2mB9UWrFUcwkjRWoEg6dhL7EO5IEOXyC5v98fCKS1t/s73O/Y4+22lNA4D/tSwcTNDd76pqRGf3Nbi8Ud6Pf2M2uo+5MN48JeTt/Qt5feFrrQHyTX3j4epQ1O/Da0a0hUDgWJWx3B2qDxh49Ezg2FiEJfny9UD/swXA8EAi9OzIuI+LCvvSvQ0f1QROTE4lmpAO8ygwEIQwoLtv/g8TMIgQIAMzMAABD2QotJtUXl3ywvRAIQIlGRnmMB8EkROUL6DnCpsewhrM059FDtivfmBOYMyI9OE1B7WOhuBbs2gbRVjgCgUgBIAGQAs8hgClGgQgUw35zPzEgw+x8lEC2E3C71ywOzS65Hb/83RFaeA4Bfaz/+dHJy/dvvzkxNU+hLyj2ADIkxRf54muAPswIYM39VAEN/eyZ1I0OSc9Y7J0auUtmylYIVwgEE4xIZTAkFUAuzXAYFtVwJEhxcGYCrXIFxOZK49EEsm+9TpFiNUjmyZCunehyqtVZbY3NNM7KRV3UL6lFZbUCOfLVsvkm35SmXrUKhlVR75ANKHNWXEbR3JuEdbaX12qwcQKftHPKwMjmKFFKtsdJqwZQBMWaGrQUAAAA=) format('woff2'); +} +@font-face{ + font-family: "dm"; + font-weight: normal; + font-style: normal; + unicode-range: U+F8FF-10FFFF; + src: local('☺'), + 
url(data:font/woff2;charset=utf-8;base64,d09GMk9UVE8AAAJ0AAsAAAAAA/wAAAIrAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAADYIRGyAcKgZgAAQBNgIkAwQEBgWBdAcgGzYDQC4D7AZ5j5pUbZm4E4qID3FTshVsW6OPY3WC/+/H2rnvv11ENDEkTQxJq+WNZLJJVAvFm3aTBlvR0kSNfr+2Z6qhKJBM0ol5Y6ja8JLIZEphKPV9HOFkV4XMJnQRxziiAWnTbbybh5rcbQEVaSD4FH+CudcDRKD1sl994/9u8i2QA733nQ5oNDZoniWWYBj3EluDplDkEWft9AMzuudgMIgSdgg3JGVZ6nd2Bv1B6OwinfywmQofJcRH0ykzg9Iyv2VCRnk43/6X4PXUhMnvZ0DScRl4PMKMu/dGkVLCpfSOfGPKU4mFo9Fou1nWW8n6duj0Rd30ilt5v1vpvG4gDg9rRU4mn5ajexpYLbO6SVMBJFn6yMPnuxP+npv9XR6Oexf1IXJq7ndftzNJD8lJ+eLN4tGnQyi82LheWVxa3UxN+/QqTSCykoeAZEqVU/vW9jG4OMD/A8AQVapZPXFvadqHlofodtvXIiw8pNo9iKFmtUk8TuFR5f/AKby9MHb+CBB2lNQMrABYE0KgkgMgQNB3by5Ifm74PoDHdQc8nr2e/PsjvPM4CBgAgnD1R3CR3f/+5wka5ppslUPoBgEJxDCQJwFDoIFwhsAwxiAAE0VnpRGKc8JoieMZA0MrYy0NTVNWe6wa1KnXOAqs47ZKPVasujW1dOXLrh8VHVNNM31hlVp0dVnrDCe6X0Oolp+rqQoHNO6f6u1jEy0DfVb1atQJDyDxv8QBAA==) format('woff2'); +} +@font-face{ + font-family: "dm"; + font-weight: normal; + font-style: italic; + unicode-range: U+0000-007F; + src: local('☺'), + url(data:font/woff2;charset=utf-8;base64,d09GMk9UVE8AACPgAAwAAAAAMhgAACOTAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAADdVsGigbIByEAAZgAEQBNgIkA4N8BAYFgXQHIBtHMVGUcl6YiKrNg4AvDszzN8WEynDICJ/38/YhJWU4TgPAs3FMEIo6bCIl5YQjGSHJ7PC0uX/vuAKOjlWIsGqW9XUyZJFlZKL9c0b0ZkXO/YIgud1/FQTeFnjQVmFkaWRhoM0TCQMNNMD+//85v5WP03W/3szQCUPueevuijhaSeu4BbUkaIJYDG2QCBDU60bN5IQ5PP//ybb+f3+8Z/qnWmtZcxlfFw8wJxgHQ8KQ0QwYMmZMYGiwDbFNmCLGiDoiEwBTbCE3jG3AkCfuW30KvmHbPjR0/SOeEjQi270Yan+vU9f/pBBIRV8nxtsJx+H7S4ZvBWSX5BI6DtspsBWeOyEOcyfFZaWYcAFxmNph64q8L10Jtg7zWB7ql87+57JKRAhf6uZuk4AKqjWHY1DICInvxoLEoWOIzoqg+7thjLH/YzifqWAPEHXPMkBoAhiQG82XsVmXtXyvdZuEH3jMzyDwc6LAzxn5GUr7KUXHU8g/PIqmtoLxvbfNvVt2gSL+qeZVKgDoz99PscFnnNPjsnS/+IN+HnBIAJE47PlKZFZtjDk22qUfvOCDOJRBbxgBh8MmOAEugBvgIdgPH8KPCI4oEB8kDElCpiHLkHUtCb2gEI3uaH5emmgUnNPbn6uqe8sFPyBKWsiiGjyPUV97E+GKMOKPlAjFrggVvamvql+SIMGkgmWGEQhPwkq1ynKlTofvr0IYyGdnKqDa9yTu35xetr7grLCE2Ege1IAsXJmiU4dlcZG5F9VdJFdLEi1AtVFAEhK2wSBAhOkWh8noCxZDs0SXRVmJkikek01AiJRNuCMzZOJXlevPZlK8FYNOn3yo321Jirp0nJh8XTzS77tk+iTH39gQWwlXozzu/Ot8CL/Y6DaAWmD/lSYAxmQIYA3KAfoAuoHIFk5VgtEJIASOQMowIRhwC6JBNiInEBstgNZC20p7RvuG6ugQdBp6AONh9dgOnMTH4r2EmphD3CD9SRPZRa4g/6cn0fcypAwLw8q4ykSZw5lLmGcpjPKlsqgN1BWWM6uK9YAdxd7APsshOe6cOZyTnA9cA9fst/grufu5V7hveYXBCT6Hv5T/XW+Pvn4bFf+JD/yXRTP47wPnW9757n++m/s/Je1gtd/qb1k3wWTBg3/9P+vGJuNKhsO4F/Rd0Pi/K1St/j2D04I7cJkhibAHckL/OzR2Smbo26pG1V5Td0yzVQurNnHa8Wl+h67Tt06/V8NRs14tWavbX0H+Gk3TnOBVTxjvTbRAPqbyeEzV48gbFMhixTtDskQ1msRZAyd4IpoNckyBUTyK09pMo9Wgnx+pSjeJsB4vShaLCkEzrYk8mPAtQJnUCLlVf4w4HXhe+9iqGg3nK4KdXfCKam6siryOmTkbsaJyrjqsxxEHmKR+3I2aRAlR8mngOaCiK5zSfEtbPeh7eoSZ4uVXyUFt99BQZZhoVbk4d2pnVe3wpGZLWtwJ5VNXnilaV49IhHCPDKEv1PAvrNYODn0l43VFbcAZkWgBh1Is9EV49T4n38qTm8emFi1j9KPyXaiEtt9HcNhDyDLoHMB7pVPkYHAWtgq995BiX+5mdm6JuFjMFexAm45WdnSwnzsc36Dh9G2796MfaPEwxIJESuvIro7mt/p8mhLcnx6N2RhizBwWID2Kn4XfxYOki8lCtn4nt5ULuGVmnuzVkjB0lC16nHHLyGnJN5DwK+kwvebz1HfzvLQely+5EaaemEsFt5Qvlq/y8t/kDf3Jpknk1oXtfvG8F3Kj0pgFhtEx47nbV4xZoh1SRwfgTlXo+DuhOb2mfZEtCymS6YXfA/U8Pn3hYvD+FFNSc+KdY1ZThuMHkvp7R3sOd532f1BlulOcV0wIw4mYNIWAu/IUVbm6nMtOObLnLH7wYFBHE9xIR5/j296uazDeIpyRUTvch+xDmkRttwl1vFjDz00/e+4qd27XkQ1Xmttjo72TfXH3oP23CWvEHngEgGRVKLNdV6ugISS6ODMBnwLUJrNcjjglOIN1LOMonuJgfRazQbIgODsDNQMx/hPoDoz+DE+EwRDsvQmF/E5D+5cDHDZksWckHR1H6d3qMpeJSzFW0EFQBvY16HMx5QdEsQN0ds38I7FVagFf5a2PIuEqTd6MAXjxqs7DCysgRPCthPvsHIf01FYyVOijNlREUw9mZsQ6EVE7of0xsDDyupv4Vtbbk21YIXVXGvUVHtLbNxNfAT/yltmfYkIdkeTAzj5td/OESGN3LLGhC7wAGJgjryWNtP4c+N/NmYK0Cl4ku90thtXZallFXoJn2NY8GaNhiYenluCBtIhIRRC/KzasBihg8gz+GJN4dMRkMqv4CbIozf54OY1V/lLbelMaoB6b96X1oCL8xjj10
B8hNGe+6XPflouFkn0KGIUcbRDiFkoQ0AY4y7iVhXyfS4M/9ZWrGhVNJhbOL3EHgel9KYFNHN+2tkh8UPlinVOdmcmyoS1f4rDh0c3DYk5zm1bbm0AVS6v2l7WUmdTlVhxGX0yzCKfY3hsCVswrXFPl+ZabPQOj66w7VdhImts7Liq0qV/VgJxPeRzO7uzL9jEhVnMkaRKxVLa0ippP27zI0hDOYhPEUMLT8SUim5lVW41cEsVljRX5N2/vv4S9l+NdilSFyiLQnAU44cHWAh7GmY4yo/OS/gguvha2nTKb96ZYyGjxcfrT6e9TsQ1rddtFaeFjm25EhvSKRM1OXw0GasALTB/0vrg7A47Ab5ePPHzlANniKTM7NJ7hHMVpbpXKTd6Jz7/GlMmj6RTho7OY4ghHhbcTRtu85X3lhiqdoAklsLPgGMIllSeaEl+whi9aITRnieIWIN1jg0nJ5Pn2443n8LtffGYxiGIWS3v5t58MDenqjCLvPJ5UYCASBCJggKBdCfO6eKaEpcINzBG4QGoelpJaWIpwG8NgLpoK/wXtc/IRAyZk8IeyVTFr20WKQ0WQsh/Bfv/nL2AZF/bnZbXBjbvi8HhHvRPmUFawkFNnwlTaKFbNV2Zks6wpDYNDMKuwTm69gbIuPd5zB7+4HtWCyTr1GWk2cOzNDnWNugY00+ypDF/TKhcLkfIj3UXTeRibuGsM3W5/T5ewCzHIQg6qtz200py5pt99G+LVffnDhR2XpgYzOq1xQ2Y5mHrTDyScIH4NbdFYX7UT3LuLVz5+dnyJzKzJ4gbfs3ANB39ulO9npW12Z6tFxM6OGND14Osbz6sD3K63/J7t0QcusEVZGY/OpyXtsO7l6JDAaQsLUZGraCU2pTRzZdnNXvUnF4YVxcAg0dpoOyNswNlXQF/+M/BfIbynPFbOGjqfc8NABFQQIfUIrC3rJyPXPGUg+KGUtZzIFGs5mFXethC9zDk7Osot80Fsn7igPlPGuECtPgm0XeBhPo9y9SgMaSzyrXDFnbhbsCLFxR80TWnAq4w68DoQnp5KtI1kehLSM08Yn78/fZpZQxs3l2k1Ncw5JoNJcqzKEhvpGT01zjN023DjF9UthYNbA2lqiVOUenKVMEZhEo9i7YTqs31W2QHA8I9MWoKPPC7jPj2JAtgQnPhQaAjzwM5jUwfUCGlJaGEkUmgOF9sBhHC4D8pUVJN6LUdZ4xTns41DjtgqL4zb/gbJKkV9Lxwo7aMA1S8ClLOyyNCufn5iitvLaN+Hw0OPT9pf3b5zB5jj99CrlGKs6lhhKJfhh66nP2kdwy+vJAvX9g3aFgpcZXmKO0L56C7M/ctM50JZkCkaZM/4Y+A/HCQiS8Wi+n2GCyWCnZQljsSH64Iwa7tTPfOZeSvi7eV+/eAQrAhHPnqsDgsM6fSSnfhw14lqHsvMY0z8lR6MReqtPNMlFzfBVDpZTJyemBOJe0btf/H0DKH9qdHcptU64VGj3Ba665WlPur7QLzAbNJdTbRe/EaRplUwk3igPFUoiamQOybpo/rooOAK0tw+cshH2nw2VaASpOfk/rMPHQARl1gdzEZij3KrwDhCAIRf4PW1PLPrYWAci+TnDPGmWQ0nzwhpQT+iTTDD+iFv4pCAPb3NnNmA6T/0pYzK/7bUeG1qNBJk3/QlA9YajaJvQQi3RziKiMc3Y3po4cNB9GZhrSLGDO6oytfZzEMfsAphkIL37P0p0Jq9cj/jgeWTYKgvAu4EbtpiLhwLnDPOLrSqgZPok9iUtRWQ2wxOIfZMOZo/tT8AJFj5C2QV9aJJCvS+C68nE9hFu32IkXINkwjmSz8rNs9Wha17GLfmVnZhBeSy0FPNS9ZU6oVM4gJ7AuI9n5M+6hz6gyxMi/C6rwu9NvUFf0iepx/6NxJLZluHM82oF3tJHGo+xS4U3k5L3XEQcu1SKR3Y9tJig+cxyue4lpOL68ANviCAI+4YaG41pNNNt5morLuaLJ+R3v4SngvcJ921nT4XH2nVFqVWy27X5F3i9FCozTwRTp0sQvB9ylEEQUSjw/agRpxs9f7ylnITs8E0vwFmCgU0tMOD9H4pORLBhRgrF4rlNwGB6h8bFphWuUiIcJnqWTtF10WuUbONFj3bj0RBn11o4Kl3EX8p4PTPFcB9IR/VvvWm6ubKRoHmNte4VTzQAn8lay4MkwY+IyAm/lXxlVY5MLLGwi30qULD11KP8jVCc3tU4+bKenvzQg+fbQS1aRYtSXdNCftw6iNGu8lOP82u15ZgmXA/048ycZzxfz2nTO8LyHIgY1Y5rOIjvT3xNK2iIbVlJfM5e6B9/N3ByJ5FlPYWoYKnsaOnkCHvsTxFrjLSHhAlXhczjJi0XvEw9Uo0Ic1ZYjrn2LCmKjukYjhi6jXA+kx7uHOUwUaWsCtPiwHeMPgWOyLa2PuIz8zRHzFOWnD15ad/Br4xgF8J5al9HSVjHOYxZWjoJ6o9Qwr6ZRkhGYoMknQMoE51+cLeBo8mPTO5xfQRtppJ8q8dw+csgKrzdvpTwmAtDT6UtE457kc8ZK3kzxDKQkuimDJKuWY40LCHhG8iPDikk/H22z0cQTuvUg1YE+wJyk2m8lhMoJnd59Za2H6x5UrNGNOYXH2zRVErO3ZLv23FcLvmRr+bHZ7RYUsf3UG/KKO9e1RqA2yUVeRU5hgDrS4uurhwqYj5Qpsvhb8f+y75lKxX1vNnQ+6F/URkz2WGx2ES7pS7gfqovVDoDzLHXFv2W9nm7N34twYC4PgW4SB7pFP8QAhjqcP2XmV/H7D/XDZ8TmHdQjo8fwQddaA6NkTS6GrPukE+m8tPTk+1dzBZLYqCqIGGA1JOozhZISCjZFW5WTy721HVCDtwsympqJrTY1a4eVjv5CPoSxnkRwA0dZE+NodY50StROuYRUaj3oe+rd6rgqhzh5dmXRQsn3mlKQINzQ2Ze1xRWXeRxsDnpLc3K02raEx8rVwRYSpEM7pG/0JAUe5ZltC9v69CeeX3tKqHqTJ1qt3X6XqNTBCKd4GU2MNoGge05Mdd2XH5UNFgUY+EEMt/Uh9hWtK0dlNtGnDQEuL4M3l8ggTY1R69cFo0cUKwlWbatXKraeVHFycQ5RiHhOLD4iOxXgnxp9KKpsQtfJ4FOC8aEXwse95o8Zvob4/34JZcO2k43xI+SsdtXFIjRRPiRTzITTEoWmnN3sALjwWq33RjFKLfiD8EdLTdgxghVGLtRbedXBFCMYmreK1gD5IqRx0QN/EfQmNLkdlXbANQsQEAbJFYSjuLEIcRnMw+gTDaWYx4KTUlRkCkFFJOZO6e9WZFxOOL9KY/PJwpfSo6QC01rfKII6m191HsWN4XnJzcxiu9lDR1wc2Dm7ogXxPcYADJBvGK2BNLRBh1l8cWZ8Lu8YDF0hqQ3bujBkgz3VFpw0ysCFD1prs8f4wWw2+Ag5q7+yMSzAtzsxItLj7d9Qx7X257qR8h2538R9mpgydAn1wULFdidrVBrpsb5hlVwiiphtBP3kvsuWeVwU6MMJNjZgFnxL9mEl4G
a3BIQrr3+eeviocZwXMuwww6UnQbvitsA2bY+xlk+yBqmWS1L55a+3zyvG7Lf/CVm0t0SgOjX40aPiS9ZMALIpDIOAeIObOJMNnRaNlutIMH3EafCKMUSye/5N+9chl/PVKXpbOAZTOXhjM0OvkOXUkJ0TK3EzWO21QhEDkzdme6Iw4bMfiSCN4iA5zMk9Ge+Up5qQy+LhNrN6mrq9h3a2w8NQ02spJzOJHtStrB9eLt+uPXEtxX/bK7SrY8c/dh0/vt7/X3mOnkP7qROngK4oqNaQfwwa7Khj5YMYz0vyYtLigtxIlFTQe0AIH6svFARYu2UdCYunOfyJ/Tt+gYJkyIGPuYZYQdfddUevDsMzCaH7o9PBQMQMf+1NSiHB7OOqXvt7PnL4aMviB6d3RL98a1+t75Z1lIFsERJVuIvqzTC2aygenswZnxwJ4J7xjAzWABLksgB/O/fzAeXfPzns0pSxeuhOQ8lTIfNC7yj65QKt6ZexaQQsVyyPV4EonfZDzlOBkd7GB4vMFn8D3QQfpyqmdsBgcdbXiEeA0h/do1EvfQONYXGZhVkpmfCccz0akz9UMDrL7qpvuvzpTXgLu4fgBXs9skxAU+HPGjdStHg37TogNL33Zdq/+9DkiwxlUFNszjKgkKnAJojsJHNWOjqbwIP99NyNla3Jsc27PGYFem1WLDVGDfqXcZhZNx/9GBkXMwh5IifMHKAfZG5+BDA5lQdxZqlPYupMxoov6SgKnPHn0wcrNiurbCZBoqBo7YFCHEtQbdA5IaoqUmmOfnk6zENqmFGvVaL7wZ6KNCxRG3Tp655TWKL26krNkhfegV8ZFW769oKTftI11V4HE8jajZyejimmPXB/S3N7FV6IXhKUhzem1Yp6bJ5en5vEuZydUnUEa4o0b2sGefMh8XnOPmdqF+cR5RXjg2tfPynZPfNjyD3Yd1R+uO0ByX1UJHBAuMMq3VNFoJaZUNGXcRjClqaG2sFVcW2lS02djgYTKZxDPBLI6UixwRCKpnTRItFkR4u+CYuIHTWrVWUwsxYrRyniIjm+1w1OPxb2+Afl0dXDvciRqIp3h+z3Yr0Ecn4CYSzo7KDuFjHeW722FPaF2gNph5RatOi8cm3LMXZJx4/NHxV/IH5g3zyY6M4H4mU8NT0E7+JNPbRocv4dJz27c6xm/zEUHM+Z6/2m9jFvXhP0i+GtUbwKxwZiuy8MgIY3uiav194L635SBebnMb0Q8ZBgyC1RJlhjKHabaq1/+p8fXY44KLkNCF+k1wi/LE0eLOgxpYW69preglPq+AAt8Pix2Re+1TIrBrEoua8QsungQjkgLSAqUMasGTMJ4aI2Nz8OGGBIc8WED5rWrWbGPX8Iei2N1bWXepLV+rlnfwJTNZ5XJnU1M3vqSPCCqEDhMBooQ8ymGdVq1V7xacMgyPDOIzusigAhhBW8SSmHzvaDaqPb2992B7YyvnHEQrl2UWyrArr0IHG2/qbunuVnZV9Vf2M3r7byvjnjcxxoZmBsbChZz4Fj88Pik7JAE2jaIwokTllnN+R+Wh5/e2fl8h4s6xs0+fQ1r1hJABUM6OhB9Nrt3ZOXtlLz9P1uvK2xq6a+oEikilclWux4a9Tj6ygMCISBiafmQnHlGDI/O+e/kawDFk4y1+s9gLgGBmIwTuLzJcKcLsKA5lZ/lnXWhFUEufT35SxBZ5hUwWEy4dllwR0owATkec3LCetqPth0BKD9LPpwQxhbCyfIVAqrom9SdfcoU2HIM5YbOX1+7oO3wlOZmzE0hejPpHnro9w7wXzi/0PKx28h9dF7blnEvxxYMq3XoiBCWnY/uVQ01fBZ/VEhGhl1hD6O8L5vTZu8JgR6aLWJgUMfEJywk7/AZdK+gsjKfZomv0jrD1L4iyHWmmOA+5e/qX7Y+4lJ4oQtxP+3zhsXGxTZpRC0+fCr39iNhFH4R7rIjb6glrK1UV5ezZ1MFkHRc2BR2d3yiLYpcqCxT5nJZUhDcIsCkzbDJqr/z8CfbIGNqwrqxSgxmhKA6szS4M2lzoXuXetaeE5gFvU1KVrNeP+aVDh38EO9Q70m5CeHVTLDfGAc07d1WWjj+AuvlHf2xxVA7P3IffTMoGJzVF/okv6IwmsseKrXaOxhmuL98LGMa78BNVqkp1FRQeILzjRByM1L5UnXjam9d4yzjGtKBhXKrq7LbmzO97W81LbJJa3CjrWqZvIdOc/If5507uAz0u215wCFRIn9QApbXl2fZ2xzbPyEhH3DD6xYd9cA/3RBPqx6cFe0wfYyH4OmSYOYHtaoxcVhdEk+amWQmtVUJAEPzSfPA249MwARscti8/aOBFTzVKkXGKFNnDaZadrmFU1ZM62xN+H/PipzyM4tvEu6xhg335jdLTZxwvNr3WfebCKbnhimXNHLpr6qbpW6YHa2514YB8JvOC+QubxQdrXY31GIdDolbr0VzPKOpF/cbdeB838OYPF5UEiS2NS+QqvU5Zh723Lj0/ChCjAWE48ysR2BvQlOZ1OLUuR9pHwzOOpAE6mhEL7/LcLKoPXpCQlZiVxERNDov0w8sKVZpC0NQUD6+oM1d5Sor0Ctl1F1lDm9geV8hSWJHKoKe/3NroNWf1gtjNJ+5DsFYuV+ViRqsNo9yQkbpIP1YfNSfWZ2rwUmvxfm94RL4yX8aQtGJUqnwjs3qxKi+PbVw48kuw7tJqVWotbLqMVifLlXI8NMxw+Oqpq7r3sK5eU1fH3piT4aTlPu3tfvLy19wDA2xB9tGxulqlopbT2OJW/b/chjS2aJtW45q0JdNwzn0l8U4X//3KGFMQ9GZ+2bc+/evAJDNX6n/N3F6nrsP30pPsZ4P+1hWZHJDlVZxVnKnIyN1sdL+dZNe/hi4SbcrbmL3wcqBkCxO9MCjTGy+SqzVySNqG1mzVqrT4S9mPb7tgvdb8qLU1NyMP1b/Q7F7sGnN+e9fQB7znIdLfMn3FnSNVsL4u+6ll9L+WZ7QN1O/rwDPGNV22Y+cxyjRccpVSDhpavfseWjxVy/XUyT5MU26ym1OIXQxkxorrF4a6BvDxBmmmHvTU2K7O+JNG6e2HmWdg4AEUMyU0IgzPyGhoTQPdd0ic0O2KtdAxDr7X798N9y5tdqN37EAvlC0PldCrhA8rm2/KBpXmLfgW1WyRKokgfHyBCFJhy+nd52oul9Vr6sprmbSdnc1v8e6HSP/6z38tTbtRC1vOVl+sOc+Ae0DYTbmG5UDJ3RHaiOXlzwQvIHwMhNx8JryUcyHPJhmWBebaLNooX7s/5HbS90lxjjIHpggbC70U8tI8RQEzwplQ4oS10NYI6neh1bwRAPC1bmn6Enz9FeWyNdk2cdQQl/cpYrOfZkZjjI+7TFOOW2rcZLMZnuhsPTOOKPmONxMdCtuRw/93Z6woF6AQKIGZYuXCF0rjyNi+9SkSaXKRQqbI4Vzbcp3yXJlV8xVZOWzT2/H/qFUSbXkYu9PNfPbfEMiZtxyIuRFTKTcRljjKduQ6M50XSUSUbrisxZsDaSKAHJPk9pu6B9X7meKdo9qb2Ev
sVv5MNKS+8eX7q7UieesxFQ5tZv0etb6evTnnrIuW06/Rt0f3mCmTmBXF4dhWlArSuCPLqbcV/yNdi7ytZUGAk5Ux/sYd7rXm/jArFuxfFbftwzbWYlDLA3u74w+mwfUiSIiAeZG7X/JDW7CKvsD0QW8uZ6vPrN6KaaEI9ELgtQVVOqdIue8hqviD5keIJBbIedDp/53oKTR1sPThJ+9HZa1vuqmEkUFmHmkcS+otRwfRAYKnRnKsnA87s1ooArwQem1QIUCaa4k96p2wVSHYsOMNfYMNw6xBbyjKV1JyhPCzqaRH3V6ogkda6H4JCUrYboDpfxgAJgKlEhAjrLw7DUoECXVtn8baSH8O6ASbrKGmZ39kXKA425ntXDZGqOVUfSwhLCPdigxaEH0MAK1Umh7UvlpmNBDohD92M1WFhjyfhPnEXrt8Pz35yI6e3EQ3vAMzFrNQZpDQvAjDZ+pST9q5NMymznUPXyZpoVjt5fn3YLXk82z5dYvZE8u3v6Evsby7kCzkkaLCgVwR5Tliv4RYbWp4BdHtgCDK9oVuo59RENqNbqNINB1/6pj3yBkhDBBvUmVOCvBz2UUu8UQVBF4X4R++JV24RHzdI9VveMj08qHxEF8F43mQqBxg/tNJzqHCtsUD4JvOuLz7zcQO3Ekl1CPSz2FpuYwvtMvGFHG9fKc2qkJeKSruBaqtmNfEWiq0oEo/m1Y3TzU4+U8f83xUCrWedniRtYFwAxUx1y0//jV5ifCv28gPEv9GrzvI/5Q4hLkyB7aT822gqd4jqU2s53FtrE5UsYUJoSkS8/v3MSb8+wMgxQu5YIlq4mWqH8t2v5mowwxhTeB9UjXBrJzYbNVzmdBCC/SEof4TAUCxTOIxL396lRfAIvyOtN2LKhwJFzFy+HXLm5cLmtf2v1Ymab2mDRFC3Q6ZBQR/m+IxvSflj+AoY4pMKQdxelfroUpIrxK3+uPh8SkBvE5GrkOUxq68ZuS9vIAgkT2ETXg5U3hvVK+2FooVHh4S/SMC8HYZI0fhyW4ONYoAlgF5n53V0qjH5rEL9ELqTMOOvyu5Z2N+O3a+yoBbZB3qKyEC0g3lJEVDGIoOM6frWY/VNWkdrZDcGKeO+qUUaOL9K0TlrOHnPWbqowiq5CTW1Ttiq+KwYe8b+ggbBlkDbyiCeDphyLuRHiuMl89fIj5f5f/dL4LEECruUbsXCvKvperK1l1FI3eiWz5Ihcv4J4UXp3qXZ66oUkJ7XiHY7SXEa3g6gPVJp+CgOhtbxfnVI7Jt88FM8aHziU3l3J5nHrKgqUE8n+KbcXc0u9P7y9TgKHokekPQy3tEYCbDWWp5neaEQ7fmrCnHslAsVbuHBFgzOE9m4VdxN+lC8ma2QaTb6gXcUjNH9miJHzzOenMybhu4MtmchnNkCb3HdgEDNWSICebdJ62f5o9KTcr3zsDmp1QnlccBz6RiLBetqA2j8WygopWt6nxOTb+n5rxT1BlRRngV3Avs3NY6WrNGpVCywwpSHCndyL8444OnVYSdEl5+SZjmxsRF5w6RRDb8oWX/J7dYAz+Y4BtzlrPZn7Xwj3QmGvZjE/sKzblr/N5u+920dfck1l5hCtG97OeT6cRzN/lPrXvxdPplxh7m5Tlp7wZ3/C+1Z8nCxXPm/kmxBl2FKgCwQwQgACiL9AhUoRLsLFPlfI0uvkcPcVswg5IaXGLrlpD6OPBVWcQH0U09H2aIi70VXMsZi0tA8iMQDqJlWAumi8hNZKbEfkS7FMKMhSF705CRD+f5P7QQCYYesNnzEyHRMLkaTbLEH15ABFJQDBA4tlpvU0urBkIroXmRcm3B3VDI8qE9lr2G+jpcdupi7eDQRnmZthZW66R04OEsnVCshAqqyrS8SKo1le5P6UrHRxO0AlfqSp/TTiOGPNryi3ek5hhpQ9jycm5kh4pV5YStlKpdkRRDmZRniy/1k8gmVLKmxYiPUETzb8qu+4IjsLBx8fCpaGiFyZUHL1tKq1ToEAByCUwpHFSAKeJUySZES8AI0ReF58FBCxoJzV44B2C4vYJCLgHA3Bg8gVdkqR96hEDLgbFco4pN/5OC3QvSIwCYT5SOkUafWCOhV+PHOEvC1BIzj9JTMvMIkMwy6rddQdrv8P9uexljD59/Pb9vYzq8kO/2AHQL6vJu58lz5p/GHjliS/7/h/4QDasj3yA+n9Du/wHUTbpiYDs8gb6ylXp4GJ0LAPsywstnV5yQwJ+w+Q2gY4D1aqzRY2XO3ccsmr/e4ieAUZJ4MeZIa8UB7pmhSKd8K/lD2AsvAbwiYESM9BS8YdwF9C6rpdkqIdokUValWbWxhreFYSttgwFCaQwrZzdlgguyGhAsHuTRIsKm+KoSUXPzbaxsgVylWmu/mLu4yhUq5gC3PGlfAyLIeylxAGZeqzYcoO5wqj0XllylM/U+g5EOFvDZVNS3hwGn566ClQZ9m3IExpS/p+cCUAKkA6MBQDBT51FJj+ycZ/YJl1bncsJLNWo5pAFgzCHq7hQjx3W4n/9FqVOvQ6MKZco10/p1fu5FzyCgE9imyxGfhdq9VrqP9V21bYQe/0ukSjN/36KWLhgnLl6E2jVBsgQryVbfF9PlMPo4Czyb9Cd1arVp2FvrHyf5OLKp+609vBwAAA==) format('woff2'); +} +@font-face{ + font-family: "dm"; + font-weight: normal; + font-style: italic; + unicode-range: U+0080-1AFF; + src: local('☺'), + 
url(data:font/woff2;charset=utf-8;base64,d09GMk9UVE8AADWoAAwAAAAAhvQAADVZAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAADYH7QBocGyAcKgZgAIVSATYCJAOGUgQGBYF0ByAbHYYzA4HucN6gKkGzzEZUsHEg9AxmUZRLzrrA/yGBGzJ8NVDfEQEVGVFi061ZzbDpViOjGy/dspBL6h6DlSgfbO2sbIYKtis3joUt7Btt3Bx8QpQf+ZXNAz5yhMY+yR2etvnvgAMMVBAMTu+wApMpdmNhYaKNaDMz17ooFaMWpT2X/efcWhdu+xm4z4KIWrbpvycm+RgRQjaKwqEsGIFwIBTCJSc+RJaK2O29RilC15tmBhVLGeycIOFBidoUhZtj5vTchAVgl8K1KaWUbf+/rzNLkVyp65Et+U1Th4lnIHAY/I1xHGDHOSTQ3o184D/Pr+0f/ntzX1/jGDzwMWImYSFjFZ0GYGGARYxRiAEGUWMRVTa0tAUGZq0z7Pn+//NT7UejnBOVnFnSOlqO3l96BYRMMi6QA4x2QHGdAingggN2wCmE0AFykCrbQdu1QuAA7Xt1pP7DtbVKmpwvesLzN1Tnf83SziRzwMkVEJ8HIlej0j9/J5NMMUuczQHwhP7fPcJsfzILkwOiXPEQUgBi9+PuXOXJkzWWXGVtraiUVVWiStWPn1MbeEUQrsrE2t5Pf+BdgQZsCYSpvBg3O+Mm/IiGpycYYAAa7QUDHADGw/+fqu/a/4IBDGHM0jWyxBqwnuKYQFUqeAW5tsbwQnixtYYIP7PhPCqzToFCEvzp3IY0q3IrfdMNEB+GdDYgQMAFgixegbggrEhcxP6sdbPQ/vQ/dv+k25zNRqvW/a/qaq1lFjegHvz6u5bnKejVcBJ/kwYiyjYAhXv9UtJUss/CDY2r++8Vf1KWLXk1AAECUAJ0ugyxcbjwFoQnToosImWqNem0w16HTDjutFnnXHXTHd9Z8dxrb6374Be/+TPt9ATCGzdlzjIGJpQBMxbscDhy5cFfkFDREqXJJlKqTqcehw2ZcMK0RZfcdN+KNT/4zSeo6NDVO41Ymbcpz3wLLLzYkkqvoMqaCsaXSqm6pjrrbbjJ9lXWsao7W003s9iHNowxNntOPPjhSSJUoIREnVaIKCZN0aTflJhK59S4KbXaO3vtN+0ZznSWs5/zPOe/sEUvcenLXdEqV7fWoYWXWH7Va1rnBja+PYvt6Kp3djW7NX38G0l30X9oOGnTWJpCs2keLaKVZELW5ETryYt86QidojCKpHi6TqmUQ8V0myUsrG07/TSccAiopy98tKQoQHIfwUoRZ/b/JSi7w0jLCaeccZkr2looQmEQTrVWGJkTTodu/1eiplB/1Qi48i8CBK9tdPF5ATGC04dE/U3JgxbvVj5b26sd/1xRhw3kDFuIQsGO9/zEQ936Rzri2dIFTaTfAN/8PxwtTrb8eJZ6szT053nZqYUY+TjcJCZKw4w1akEHB3l+yEdlcaSgPY7E389zk06i1anq5Es6/I+htdJ4oahTpFY2k5P/wbP7JMfadFYJutIZK9ULZwv1uJ5RWWsTZ2ezYEptmTBc+KfRrRd+18ZklqOxy+NSQLU7VxlFUaB4FWNjZdY+jTNKcmrMA0HkrdqT2u4a5TbKGHSpEy3LtzpX1PenRDoQq70sQ8ek04NQSiMPi9L4wXIyUJ9fAbgfLHimayr1DsuEdRtXXtyw6cwlSKYZP1ykVhjeZbvy6pe3yCLr9Caa8sLbQh7LW7NY8lS5iVdhC/NgZ77nxCvxx7es/Net36my/jdJa7Jps51KziL0dglano+VstVX4UUfLiYOgNoDzJ07na5CQQzZThO+OPiZFiMAVkDcwI72QUBFOIWYUZBi64JCYEjo0ZjGnlFQ41ar6PHotoAIAMbUsdQdlrI+IKhFANA5AIxUTr5WW9Q/3so5ZSVzE3LQHlovtE4Xm1R0tXGZtjVcRBfCSYP/NM120+ClMfILGeWprp/ru/R/K+T0b55oeNLBf1ywQITyWqwZLJ1ndwEAgbSBHCBzI5WtxRH59hnVaNBhdcqUarNZv0OG1ZjQZ4+dxvsAqCNge3YfdKofr6ZA6/Q0aD+6NuwHLU2CbrfLiRwlOeW4kzl9mfnPDCwAs5nFPQ+cFRecczWXxDV33HTPbXdzXyzXg4EnHnm6L4Fn4pUX3mZN6l1gqD/7T6q3Ab2ga3PgIOjRvovAKhxlPtfhdCpwA3gPx6qBHmArnMFm9gIjwCRc3AR0WwMQwAFPS+noQBzE/dvVzAU7/qDU2lDLv1Z6rY/0B9UnD0rmbfy29ibfaF+wr/KuAXfl3z3/7tv3DL7H554fymPvbbr3030D71t238b74u5rTBuRtjjNIW132oW0Z2lfVR+qs7SOf528Og/qkFqTgecHdg/6Y5AzaNKgRYMcB/kOAu2lwfGDqwe/ZlK2gp1nP+lPD3Ed4j+kbghnPDF019DrQ98OGzFs2bCdw94M1xm+d/i+4fuHHxjuPzxg+F0NNQ1I498an2kYa5homGqYaZhrWGhYaiRrvPlX99+Af1s0ZZqJWqLWj1q1I/4c0WhE+oiP/03/z++/Fm0j7cvSX6T9pCrWyJF3dJjORp3mUX+O+mm0xujUMV5jR4xtHWc9rmr8lPFxE8ZM2DGhYuKwifJJAyZ1nJQ9+c/J303+btl8SsFUnak504ZPi5j2brrX9EczdGYkzxwwc+2sv2Z1mVUze/zsE7Of6S7WTdZT0+PpKfTn6PvqPzCYaZBv8H/ugDklc8fOjZw3ft71+SvnZy6Yv+CioZphuWHvwrULLy38aGRrFG/0xfNzz0mvUK89Xlfqr/be5+O2ZLfPjC9j6RbfXzL0lin8vo0w/zsf6K+4FKiwMhK4/NGmoNcfOwVf+mTmmphQ9bXHw3TX5Yezja+Fr2XaRPzNe9s0JhJv9m3k7439o542MbIoj+FY7o5Zb2ZgdSL2SvMR1k78Wf6nuFdsDseb2B6Mn0og2H2fcKn1VPvmNiZJHQJNR3PBvXZrkoeSv6T8mfIlNTf1XccNaRfSQ9L7MljuKUJnj9auOuuPZ+luOJ71urtd9rUea3Iu9hzsmZz7V94HeWfyKd5d8x/1Gb15Y8H1PlzUXHSmUG9rROFXcXtxk/jigGHbp22fvn1F0VSxjk9S8a+DglTGm0cXQfVgAVQvFkyRrFZRBxKfvhNngV+8QaV5sBWPYwPUFJJ7TWkXIC/UOChE3HdWZLRhItEQ/fYDe7+N3uNvI/jhPYN7XuHilcD4HqQKwRsb/DpUmx3fJ6dQNbMmMTLC8rFrBQdK6nrfpB19QjeHI/rOWetQP49IqG6a14vf3O3orMc+H6SQnxkpjAc/+6Dx+Qu/s6eiv8Kv8j4lUOAx0us8hrzUPiSe+gvdMbw95mkN8hi++BfRV95eWhz//YwPwRwVIwrisGKuswN/eA0MQGvX/0wbQm8W050hxmdstci3t9xCDXKlJ9w6stqyyQSEAxkwIM56w3NRsOvv4vHDmDjkAdtclbXmK6
9IvDpU8aKZXmGF+0vRcbMqzH8/RXvmTTmWkMJ8OdcMnfp1N//7hDJX5ya3hTIO19pG0I/PW958bLZE0T9jXxLi7NpUmY1BtaM0pXX3dXCaM3aF16mV6jVVN7A0UGIVMQ9IVirwSIYQMflIBgwWB74lIX0+nAED471KoTjHgUJg8/kYkTlHVBAsOSxGhDHumRCawdSnn/RJfKI0FRirKL/vZ3LsQ7YsUVnRK75vrCgB5sJkbYZLM3hxkkui+lSi8BiEx1BV6X/3qO9TFJzo3x3oP/sxY0FMS9+PDsxhVg6PlXsmxQMY+3Fr7uKNA6Hv1utJ4ZWmCQ8u3Jk6vW3R1/ePog+z6MHIpUcstMhKTd8k945XCo5XYKw/dHBqgrnm6Df2AG8vRVayCer8irlnz/U+2mW4KVC8kH82fFZg9vLfOSvkpCdzZ0pPCJZvhDe6otIeVA8koB4G1cs9bKm4X66YnlKUjnZqUCgsrxCiJAhO/94nQrS3dBOGN+3b4+Gm9heh1PspRf3VvWYmEKTelrWHP9yf7CPElalWZMySdjBgdBpOsLux8j+iq+Hz5K62LpQQL07yvBI+YrVr/NZfc0E4VzyLEm7HbobTrW9azvsgpNpMNR+ZXmjzrnLHqoPTXPG+ifa8M/le3ICaeBanXc6qKz64Y8u6NR9upK9cW1UvV1SCb+SI1hY3fjjME7egT0hhyhfnQsno2eTnrBKxP+bhB7EmKhKPyYAxx/tVmSpYxIsDsFnbvvbwXg+3jII/RvaKqb7aqGbNOGjBDigMYQONpz6RlrtKJZTxS9suQwWEO7K0q6X1J5Gqwg8oxQULHVLtkpCKThM8tZtKOfwY3eaYIXsLS5lVoazInq3tQ9OiM9WNOGELh5YISdP8+s4tQToIm49KWnMQN0FMPbCmO9URijkvL7KZQnOZsjkWTesORGz24LRpleePjp469NndN2caLp1leG/dQ/f2r05DRs+uLy6eXvvnD0GYiB+AiTcOgLff02c2e3ESd8SH2NTDCWAT4jh8P1GchcWAV04XxXLQRzC31V651EgqS5Z/CAsTY4580zViOqDUzohnh+5Sh5k2BeasSORGt1x+agRChG0NYDBSe44qiXoYmxL6BIiV84++1dq8E7lEjQHESrpfC10RvZnUNOAHgwEsX01E2r3r5HoPt9vzC17qbnlftTS2tWdaWKGotdUPl9vIlLeoypZv6dLESqlIpMZZi0Tl9eEJlbsnzZizG+w276TltpOvNC3vvd0qlabaqZM0MhSmFsoSK3ERv+To44AwenLi7VYmWhlOszGDAZkhkTFJyWQKau6lMFoo5PA7Jkah2bFAdhm/a2q2rCybVsQfqHEJc8TyKWH20GdWEGQvucGsdfg61XPoSt1qPlMkFsQsto4PfQrPZbhBRMZMrioiOhqXrM0bR/Ct8ySsSrvOKAlrlJI5jHV1lx+nGi7D72TBuwfnVMY1kcXoSq1vxRF+19hsWV42jS9uDTziy+a1qMb0C6mhEDut0lQ18Zyw44kW6UN9OtI5RV1EvkRu1GD6gmOuWjgu5rlBGfy2CtoHOkLFVqd8Y2KKhWI0zxdOn9cYbyWKL1uLO7DA32Kvr6jJLjijkVb4cXx751sglgT81GRBJkietyzdu2TwZP5ejAWoLfZXN7fg1evSgtMEdSVVaII3rCiokduJtDTva2FWYRDmKxzrCrMctVHi7TkjMgRZteEpm3snKczLD2ZZalnS4/o7DvOTy47qnxIokMeroebusv5v0mBqvCQgdv4SQ2lKOro5i6+A6mk8gUJQvTiKCgFoWlxX0Ii+F3jvU/YbnCx/n5lpEiEnhh7pdoYy4FtTsRVh+1LYoj1351pktsv1h27I452r9hX7IDKWkuc7mCGsqMgQDlZMTRWUdlIEbMgeqBi2F6/xu5AidJrZyxSVqdaKrraXWUEXFNXdXKwUYLAB2qEMFqPgRV+8zd/r+HE/M4UnePJKgBEiVsn8b3MOEtMlU2b3JyKe9JnlES4lpV5GmdkmL290NcaY6XjCdXtNm1BOQzOJXxzt4Pcp67CfR19u8DqQxTy5iT0lpPLyY8QZIJ2qOkZP+a05RlAuKlk45BQkP4+TU9X2l71yIO0+Qvt8OMuaBHYBQaGWPh6Guzf3HKBQ57Ww7/x5+zkESsdqAMNY3XLCl12WFL4K/V8VmoJv3CQmGsOZZ0/zboXCaf5HEXoQgjIPp6BL0MaR+wr27Q292HzrGRubKoEYLQ/iHRQgS0xzItKxnHVwydKmxnmnKNKclNGa3ZiIkZkqt1CNHOmkQ9UtHjuN+YumF7mrLk18/s55VFW8a527rYqcuuVnhiDW+8cuNEdapRYAhkEpDX0mFLYJWzIxcubaSbM17IQW/ZZMES+viRv57uvICz9XOksDKuWKTzmif+Wbyp+TeuWDmaIaeUKsk7OhcwxRJbLCuWg0FGncVCadwkFv9PqDhvDjqu4dDy2QsHrLlwJHy3rTQ7VyA5ZKF0UWizWu2sjoVZL6xpXeTprHTvloP8kUNb1tRzwR8lWaMtXNfRovwJ+6AvjQjo5Lx1C5kYS0h4+1Wqg1e03rRbcp1FzWxIk8C0C9KBf3o/3UNHJdrkOE3KU9dlQIvbNWqa7/lRSp4LBlxNUbg73CLWNNZfN6OtacZ0WfcYOCG7oaMBsVgT7/skyUimzfcWDCjDnYwY4Lj+oXkIePdlVNdCs5nsbPZBUVB7n1HXngQ8Ot9rlo6NVklRzDRrIYqfy4FyfiaI83afWzpDbUCzls0P2JBnwSRkRZAXRxeDURlyuOzAGI/s3XweFxMuQz85hqFa87vDmSh4Jrf/FRF7M5lwzlO4sRqmz/maF+bq/P0JLQqfzyMOrnn/3zwbl1tZOZw9GwRjcp1feOszDQMt85+6ubx27Ce5FW9kmdnzVzZdMlRQGRU6EI2TWlXtVRjZL5L0pLCoPd4Bp2/vMTp6Xyz8mFrms1OfUqqY1BbPbvvzZjXJGrg7NZOrRw6Fpd2HqusxEHpdta7hqHoqgilJKSEeK6b+7hhfnYfXnt7Fu2uo5U9pSU9aKZvQmnRRp5BR3CtliMrBj6RRmzuxyR8o9atVL1jhIv7pirEmQMg8RPJttJd+TG+EjEYcvJq/ljZTYGDr7w8eFdVbXMRRWxq3+fA+HaQBh484BEqw6eB2l470WaVZc2AfDtFw9I7vNY8GgFi1bPjSZ75WdWT5pMVs06lzIBHccfcP3PxuBdF7LVEV927v1lSc4+OHBqsrtEMPkqPS8CsdkyuSgigRFhiRD8NfJ5caM6074wTmWmiF5MHS4qkd9fDaMV6qjcDsWtXvxk7ktDecnSgTYlAQFJgsCApYT7btGv7iOlF0YKp3d2pQ1xcdWVafnxn0qZBwX7vVxZVWWMf0WTBDngu2tAretr7QUzweKDyuw8pmOeeHGoOpyuJUWlftnJsu+zaXA0MQFTrNhTYmxT5J+KlbnC5QsaMnVEYoSt+hDGsvYfHqSWkLaqDWvrwDYfdh2QU5aL7Aza32fBmGliUcXeLERUDmZDEONXLydYT
bklQBfo+3EwTaitzWEiqtJtw2Gx+LmnSFblRSYcmshgEW65l1A0hQTATznyBKlsWlhtnswU1gxs7QCGbgrIGdxjLn/fr5nHSNKGz+6a6m3wy/8UUEeZH2xoqgbCAgpWBSvrKj4hB2xDyLmziBHgTAiHeAY7k4cYHtJCc3m+nonRsl9Semqnfp74+qDWHv/kCHEiOfhHRBPex6ortxaQRU3IoiEDnEc/sSpkqFiyfGFWdDWxeOSP1VyYCzMRLHWCta8A2exXVdCBnQza2sR7MLZpt1tBbGsThpnRFh/QjIz5u2yycCJPiZGKzlWm82qpVEOntKhkGbXFZeaj567Z7+WPzfdilv05JXlzdz+JqJCd8wAyfl5eboNFRUrt3NWVk+0uEi0Ax/MCbC+MrVTMiUNPuRGcrsi47UH9Exw/XAxDw5JgNs5xb2svly6yXAROnJdDMSLHBGxsDAU9cOaQi3USuQfkprIYHdojBt7TtiCrxlgz9p0S/pbJLMsvmBi10uTVmKmP0pj8RZC5fpENJOxglqu4KKYnvMxKiZRYZaki4poWkQiBdUyXfyLNYzEax/k4JHE/r6zKeBx4zdNNkLSkrJ6qCAHMv0tomLyYLFJSQjOza7LXQb8Ku2u5mz64hhERe4W9/rhumK/xVO8xUl2hEBIixeR/sh3nCfpyin/YNGgKKbyZSYpu2inLi770Mxn/xUqF5BzIJV0YJ6p0Uwh9zKryqI2OQjwEJ0NfoNVU9Bh3CT/LF48S8c0z+fn5KKkogjMng2yPe2rMqhc28aE1OEmY5eg40SyXoJ3wl0WEyl98kefBxsRJjFZY8BBSMixPLj0k1WaWvyXhSpxsSoG95QivmVpXFqOG3aC1AY/wK5Q6AVN2TlHaI0hl9fhAWDVd3mr5WN7ziV9pWcbDT8c6STYDqE0lA+YqSz6TSGOeb+Vifg0m/HpZ4zJ1nm+8hUo44C0JSpgikpDADxowRNdawMim92CuPg55qaaaancbjEabr6ghfJ3YR0JKnS6jeS1AtOL1HVltB8SV91NXG4yyOnuak7KTdipm6TH1enPUWx/kEj46A6kKfpaEtQFbOgHjiBipSJyTAcOECU7RibxYLMqqfX53E6yWF/jgRgwubdNK9CZBg2nS0AzLGCjuxyQxSSg1hNfGkkVqkMRoFJ1JPfHBPgGHLOlKikSnmGhWSvosRk1DVDbjZ/K+ICgWmqEKs5Q0R4l1TwhVFSehXQZtpaSg9LjA6sHSSmkqL1QdpWlCUw+fzuZIGCzT8ekyyyXLlJTEqFmPItLP5PmlvQIoFkJECqKzBADTOSZy8geNMT/QDBajFTh8FEcrPOkVkpozsh4/Cn3m3Hcl+MxQwIZMSngmKCxlArdexcZVL/6KCHyvp4cB5genOPV+ZGeA6Zrq/dzTBZQzDRD3aiqDtEcNEBtrfiVXN+GMf5YBHwTVJhJeMVFTidTHwROQwOPRZ9NcmoS6TSdVOxMaOgvuFrtT7EcY7kqfgFAg6xAyGwNaZPab2jvUwGwTWZaN6TptQ+T7dDvT/VKdvWsFcpKjLC5eZ7EYjZv/4h5ExAVncGjqiabhIGGNo+KkCuADr6Jc53VNl0fpS3jAGpDm40+MeWBn+rwcZxOFdFOJW0HMqOhSQT7R7eAA4DgYaTb0YyPuxTyeSYjgpM+ol4vxHN+ZpO2S7F72hVvvGyxGz0Wr3lD6qlvVYGZaizUhtHQO6BFtds6fLenlqAuEyKL/ZK8gIgTtjIeqWaDNRka+IH0OeFFC8SPNyEJjQzinbGnmfvTqq5VPLDTJIJOU1TBLm8SwERjv+Cl7qEh7ucqHZk10uXg+JKcu0eqek6pB5OIClPUCvWeyMXQcNyyayVSh91Qd92gag5PU17EkrbZcJqrC4K5WrjiFdWFya/k69XIV1pEyeeXGBazWnSe4aX8rDul9cctA6a8KZkkg0c3fNC3Dh0rCFqfEw/ME/6cOGRlrktUWW2Bfa4t3utSD6Nl/ej+ZVIoPe/ws64vsNwkO1oUwq+cgH6RezsXmeUNanu4gaDEN4C7Wsbd2vrpc082zZXZ51zN1sXjFSEGMRi9DOYgcQ1tFluvJZs4i7XeCx7kGKH0FFwb/3LWqeUy6u0ZG0JBoKLQ3f0/Onmz5SLI+XZBrTqTNpMQ8dTX44aIXCxEG+MDBC96wfeE9GWncw+St3moXnwJD91ybzHK8BwhKW+AK85OljeU4VWmkbmrsigOob+ORWPded8Uqx0HBWBMTIIzjVPRjhw4uPZi4MbytFjZjZitxeKfSb+VwhsAergtzFpFIIbnlYWzPvEiFh9fAXkw9Y3PjEMddE/cOz7s2Wu1GlyWH4oMXyblaB4xlhAnfGOPzMYm29+iNpPhaaEyNXU/NpoNHFodbvZSPNksQoRi8xdeA3u89syoy00bWD6xnB7FuQMM8E+ycCzkrj65fp0ufCftM81WbZo5ysyL42tUd3EFUji8Y7BUBtjjGNnYr3U/Bhvy4clJL70QIQg+YyJMpFxpQV/45TMyfgqwB0n40kaJ/hcUq1DGQw9hOEMx0AE4/cLG3oZIP3PG5p+tOjpGYDLexmKs1zb4NdJgdwCV4HtzeZvUe7N0VejVqXDfBjlY9QI81UY22EQMmo6QoBQ1C8bhpx7tcqGl4/vj/xZ6ZUcXyuiDBF/qLCamgxf7fgUMsihLb7PtOE2iSjq4P/vMefnIGr/IKdlbdUUt5vs3+JHEEynBzJiK6ysRWPpIUl/VDlfBLFEvxlEUXWlIqCLZ/x9PWwoiEfIB6cBLAU1nUJvSke426CoDIaCHBYASgZsNtu5i16eNSlzyRIV4d2pO2iJVfkzlHyBvZE6xNXvmgLjyJ+gnPLSRIk5E+kZQot7ICPR3+MZsXhTASYcD5ah+Yf0o344FcOl245G0alz3fYZrWtHWlfwy4wJnb6x4QPvmAiDmo8QU4ORhLKOUUnASTM/o9WmasJ8K5foXExxleEi8RGLFt0P20BQWNpTWKLv94LYG/4UkBwXEbFM9JsEElB4pC8O39NvaUj9pDy+n5rAcLpz56HriOqQijsaZoyajLvI+m1SVfRAUncpzKKWtma7q+EPCwhM3rGrBmxkGLjBSnSuo3GKPIgR+UDDuHMJwXaE17WnMN2V/a0TRfup2SaGW7sExvhKDPqqE9XQqGOQiwQlQULzCjGiPZZTjwxGk5RT5HbFWbWztNbe5wC9Vm0jNOyUiCWNLfm0jllx1eUxTeINvJfNnu96cs80OuABO9ZamyYyZ2XXa0j1QtVSzJxYeXtOmc48HnLCUghbcO3Opp2+PCToIhE1qeoNjjLGvwsdOeIclRiSxmhex3/Or2Hrcube36/PUb7Kz5Utjk2k5MSDrBvZW5J5P6tIuj67tGEOpDkjqQc0yCktboCUQaqs6+gPjELicdrQ1GLAC2JMZCk0AsALWPqaRYNQkorngwkFuynfBIdhApHr9YuGfdi7TKvgl+u2Vse/FjoqPJGqy0vYugNNoV+d4yD25FhvY+WbRrr+t2j9bVj94cj4wz6bhz0K0WYIisX2U2
Z5Y8UyW1VQW10KiBGFlmvVYAM6eFj0pjSZt4Ewr7in1cGnhwC+0G4H47fP03ZzGmqgyc3CqQOh4KdSdcMzHxuV6gNWQkNxh36Kgdj4KmoeiAYe1+exjwydYaK0EPTyWn3wJUeQybbvGx1WSvfZNhXmpwJ/TiHq+gnYL0zJ9IqD5F1kbvjoYdB6v1U8tzzwZSCkp7hTqDo12zd1SfcB7kNpBTslwLWqb3XYag2rWadFjM2iFD1chZalq7aa2U1Z4p4CuzddENQq1pP1FXN3XBlDFlgpuDsG0IyoY480eSYyea3Glrkh2JytHOx4WSHYduu2QAwyYWt7RZkaCshOPYo0oRyLLR99l1OPOO6cmWSmg3+z7oC1xAd4TR0LF4yyqXefp7SF7yyRmCyiu/27A3zYKkJbHJq/g5OeHIVPtnl6a7tXhFM9hUbxBZ9K9VOmhOvPJqzUwbF8/zykKYbjzbZBB4fLYYtQ22bJupJF+lY9aQs9NS0xdiF89AlIJ3W2yVze0yTTZ3EmyylcjkC4yHQNPTo3ny8HbxiRIXGiiBO25uhLpvCm9+aQKFP0HePjdY9kQEoNCLAtF3067NahPev62mqKaYbNY7MycJOXIEsV7wXQWfKEzScQLuCfsXGJnKhBR5mnCFe65z0ujlSBVIz9FF2nbasHJJ/8CBHVRARuQsDH5/uEFJDR09mZuHvbQQd0mfE5Js+diyc+UB59OtJ0hrhJS0h+sZENDysjtzfwBe8QDqVVdew5T2zz3l7iYmv09sMti5ILYt3XY23Sw70U1yuY9wb9JMePa3EGcSkUs+MCebdNhih+yvHl85E7BvOjzZxUlOSoplyhjhFh3OfiN/nxZX7O3Tc0jfus5+HZplXIbsVmftpWS8a30nbOh4UT6oElV/1q706Pg0H2uFyWfZHmRDoVll20uRluA4XW6IVrm2fX9aJPM+qZygrdsd6Losxq6k+rmrocah9026RIeisPfQblq6mGBuZ+2x+4K3cy402GFup84x2KxEoTnD9zBqHP5o10c1h8patYL6fUuK+VI2RfF9C74XV8CrexFvQJAgzg/Meh/pwh1aoFa/ne9GKdd7V2Hu0Kt5BnAy/n+AepVGMF3WPmXTX26JsvHaUNhcajEpIjS3MFbeXReL5sN4TRDdhnujh0oApi84VFBzDygdEqLGPirEtm7tIPYcVacwK706/izQJdnB3IjKIzKR8BkE/ch1bf7hnOgBF4pWO6lT+OLpU3EPxoJwBIAm+koA45s/BKQSh8IYzD7FxTZAYM5QWz6dMrpglNzVfaSOFrkzIhLpAvjSnPUS0AOvK4mG2Csm+BQdxkvSGw7MYGCVyarTUufpidF1zWtepNJElx/amQll/6+9TXsxfvHjt4E+bIswGm8WLRl3PY/1/VpAriSL2UY4lqNjx4+N8QS9xJ6j0l4ZoG7+ypspqHKuYrqwSu9Qx1CT9kmjxHYv3GVVq1hw1ut1FXc0SZe66VsGZxUAFo/Rs6WcbigSeHtH5zJJrp7EB/YtDY/aqS3r60hKc42zxS2vlvUNIqRt5dvfbRimVf5OQBA65ECcAYSp9vfmgV4AhSAwvqmidjR6UZiYbRSCeKviVCzjoQdzEQY2gQQiAmZjl1qJemjg+eH1sag5ZL4B4Z0nHeuhDcdSzCJj5/IhIYtYWxE9browb7oW9ShBb6tYQ48/DjM3s1+hB6DWVm1zh6sYlBYc8aS8NXRa7utOfQ6UoXoE2Czpe741T+Er9dfWGYL/25La3o5HuVpkqvhH1bavlDb17VQCjJy6xpnLp93dRKbXRupZGGktdT170oylcxUBVFRdeclpq26zYgTzJjeybO+b7YeMs+oCrsgFEx0tfQbvFFC168MdYI+fAXq0QGyRMHGnxLS+zOAnjOJ7I/evloKRehP/CzfEMDL7qlCZtapDoVgySAHGWh6e91hN1JD8ATnlMW+9rcvNMzJsh3t27nBmaEkT5t88MLbxw+Bnj0qegMfUj/kRDIJdXJV2zNteXqFszhJDXGNW0HYXmrN75eXmde7GT4P3t3CaWKVBKHFSYwSBXZ3i84pMXzyWgTN+hdIy4MlHTbMc0JfQaksUiFfFxExgLfNlIrrmcj3d5CLhDzVzzBY8uCpv8uRkM/XwBEtszZzYJ9yLDFUE/Xo87Zs9n7brtveI0+FEsGN2+Y4a1w7fSazcIFTCTgeugk2YlZuUkIiZMiW2QVZcTLgCWViR00Rvvj5IQEy18Ds/YSbtCKzDQEoDCKZdJEEAGysAvf3NFmLbEx7ar4+PPRFORu7BbiuVoGUwHvKk5Ayt+CIq10RLd03CuXeY8TrYcv4P3b8jMLqzDGi6+54vjlekgpirIB8coZ/AQJv4PvBetkzNlyAajauEpo6JvFELRyu/1iiLHSQjImEI9wx8dIcEBwinBSSi5GXmgWEwGWbkt2jafhZJTOQjVRDjvj9lTxr5BZ9YQN3uXuvrXLe56nLoe+glpbayq83JRsvR1Q68xvA2wVJ8vaNtwXmc+XFgeGFQghBzuPtJpcel3wOTP8vzt0m22S/1gUW5x5LJxyK1XTWIy8a+sa+YSOi0xqI2kflefk9M6Bvp1qbHLHdQ141Ybk10zLF/rNQuqj+BmuRGt6ab0yF9d1V3HXnW92mPXjufLx88ssz1S4ZNu0fn8hF734lFFkIi2aLQoHFOgwDiDAh88J7mEboRkQvEeJE+amYvzNknuTxFQoYYVIHX/NtoKnOuxX3xbML2E+2NsWMNfw83ox4yQfAq5bwzGNFCszWs3tF+YJHjIalX9URNHpEsX0GbqWJzL696P9ViWWhaHssJG4RWcBzxLqNOmwV68GOLzY2MT94n5NiUNBCy6X1b8c+hjNWnFfCNopdG+VUYru08aSu7mo99wUJt+25xVfu/wAZR9VmZ+EPgZUOLL7oTRq1poBZ9PK3EN/afzha77ymx0JpfkUI2ne+bzO1huBxlh6I710ot/OGQqMKruMiED/KR5PzEFRzxRIp71KX4PQqN6XnE5CXKiuFuldPUNppeuhvc9cH1M0VOaa9wF3epj2OnIiYN5wJ31u2q60vRypmWM/WwB/kTny9+/WOXNzmp9cfqj/43Wr57Bhi+iBRL06sAF4xEaYm5HlYkmeGmJu9A7PlovZ5VPzWqhGZDor3La3YfdHKuG6r2YffvE6uXWxF5KBVZreu5ko+FhEbIzHHGrlMlB1H9zZKEEQP1uczDwZ6Qm7yLHQ0YRcqUZx0vflaPThN0EvbZzE0RZanv0P7Z0j/XVJwL42UmiqXmNT/6diyY4ZZOE/FbjOpmGSKDH4MSk+uphPS22aM/Z8mm1gxgpUjztkJWI/kDaFiuXnNeDDM6+712wIXheXiA/PEaOXGmS3BRq9kHoS8y34Yzy3b+8Jbv8Yr2CrQzgsJmidXV+FfnWTK5ldpmkO+DPSvBHKLdEVhc5qFJ5AcbETCLObxgQLkgQRxxfeGNPoafwPXwRHwTW9Bwc27oei2dHceJLEE4bqROj5GJAsNbqzRqS6qBj4H6s09P/bEkcsV/xwX
ZTFvbHPeNG3qlH1F3Gbx38YOcYuQdlLffuNv2QaNW9zkPTo5y9XIHaqsG8oKWvRo0EEUPVw3TfTLqTDnGZFR3FNTN6c165viMpK4dV1wtgTnz5weYhEy0fYcFa9RUh0C5t2prurrbfWR/U7Wn2lexjcjEu0HlDgX/JCLRkrQXDrbacX1DYBM85cML1NrAoWCg7TQIMZrlHiDm3LHQWLnXWWPbbxMbITBHDvuL3AWpMOnlW1tsUNvaJV9tO0ZDZYpk3rJotVcmdlHthZWQnriUUPCyyOkx88dOIs5RGDSYBk5hlbjUnz+gE1QJX9710cGXlR2DAIsFaqmIEU5trVn9+rJgRMn9GzYucozTUJDLOMJ2/OCwshm3Y/XzjCuUsULaR2kWe6HlWFPN2+ys2FzYk9ksxSvxopIn0C4vWcJbGBm6S4NoxyS0eyCPhoOsss7Oici1sg2di7euJKltlbsHrZ65fPqraYFRTurnKLj2ta8r+Eiovf0IOit/dD9wjXSKdH5/yqxeH6JAMtSlyPLp81blvkJvPyDDb7J9Lb56vYz0zXpmENPjre9tWTCFnL+E4Qa8nQuuT59Z7ga5sMdn9XyK8uGF/b/nBBJwzl9rX+C+6ISNvjBa+5FQf37d5WUAZw3CZ1/R+f0hs3o9iKpC5HyZ0GfAO5XbinQ/IWOf3/XGkzeZrwo+7/iYccnWd3Tb99u2h56FnSmoCRsIgc8cmzVLEwh+Q/pYqZvh3m1EiUOJToIZMc9+SRJZWEyb7p7hRbdeEPzzsLUhohODq0A8CSjFMYIVh8eIcIx8DK0ZUZcW6BCRBQO5ZYBAZcrP1GXfkO6snTFPN2YMSUVjZPhyWmeuYVwLOl+9NR+8s8usQpLexB3tYlwilCcL+PXgDv6OYJGAlUcllwp6rCXYhVs6FSzb//t2ZPe0YcuR/+hgeDEjogoTDf/oGOrCLj8QMAM+PxiH7whAtUeod8rOlTucosIfkYewx+nE8t4MWHrGBxlLuHdVe5gUTHo7ns4m3zTeXKu1dfuBCT3magU7Ll90dhOP0zb/XeAyGZzsR5wpjOZxQPA+kCx47bcU9hZh0+lgaukKm5r9zjm93It/zFeS8PXozcljvoW84UEXcupDehqgxen3tj673dy2Pbtbog3WBkxNbBedwyjepZtlQ8cGrSUbqjBD3ldlIeNP3U0/YfbkPya5wJk7pMekm+J/187R9gDM+xkbPOjJmcdxfDsMvm8hL1feyyr697o4sSo80e5YLZmQJWvBJ5lqBt3O27by/X7jpK3COPcl40MsGs8HM+EPPDHZQz380fQQgHBxJ+lwAliDX2AR3d9W+UGJDSxDCfddwyDm9FLz9Yv+LE38+C79HBWZMJRfWCuoCAO+ZuWE57zfBDXGYAivMbHxB3LsoRnl0/b4dNlOs4lwtamqb8SR/4MnSRWqvvjzDPGmXE5Hh2L+SNXEkvL8KQ/6tDZuMyC9b23Jnt8fME0ixenVfQLccquFVNzqLQKlLPQKF1jyUFa66whp4nqjFpI+D+JOmzftQu06Tkiyf5CG8cbPZEm3tXZC81ydIU7kMKxXS3LyF/jyONxEq4AZyvfu7uZu6AotX7h9S6d76XSee9xeYh0dA43I71Px4EPObxqghkHbNZJcKfJ3NbVzikCCcm9RDz+tPpZkpzZAanpaeuDauh/9IGp2GVRjbssW7+/QEOE8b+m+Jr8dzrBtC9T05J4ssP3E6O3xEGJKgK8hmrzumyshZPfkieV2z78NO5pW5Yly8Z7eBfXxy8fn7BqWrBVn1h0I4CVrjcI1SIV5ELO3x+yNN+cWJFqgqrMuYIVP4r+DMlXaFumzNZg0ds+C9bf/ipRZBsrhrn4b96dEfYbFfpeGQC0iMYIClsCWH0ly8kRIA5avVnLbQGbOhttlv9rmCMRp5otr+Dle+U+Kgn5c6QHTjEyn9FWPmZvU4tuV7pqLUuKvjdDeRjAYN1y3TudtWFxUXrfzHGFdl0GdA41wdwk0tUQfDgxFu0OugkxGrLh67aXqnx7FFQ3lnTxyWRWqSi25uekFhJUKSjpLA4HPRQz7IgzMM2HeJgmcqowpbckgY3DM4ZSg9nCEb43rEiHzU9wiu6kg3cQNabGsAuU3fKw+a7fT1wRhW3vbDeLeDvbeChcFjyQy6sXepZNTEnSCIOje4w6Q0qbn33j+Do7ZS8qmzwFPsE/gLI3bF08oVHKvTZ0Wvi8jYf9q1Era9A9c61/gVdraV1DeP9JEp+s+VDXtEqf/79ddfAp7YkALCyvf1C5XIct3c5/QOIKZeTYdCeXVTxaGen5rVkJ8lyCCP3h5oG349IYVRCFpg9niNDlj5cJUaxYtWBkflxlxf/+17KMyYpvkovw7ScrCupBT9jzXEh0x8bR9bYXzl1tHg8rn5yG8fw3i8LZczRIAHTsab4OD/RsHwB0eUBZ/4nfJVIe1WB4gML88uBFTEj0lKiTXpIKrNJs/ry3PuaJarFmJl/brgoH8wrgWGS4prtp9V8l2c21AWqmHGxi/d9UBfoL8jKPKF1rBm+s8jP5HcDWG00c8uGFz5K1ynJ3wlL3vjsNTEt+ySteJy9pZK8sz4Re/Unz10DVqXn7gjbeYL0WeUThbU0E0LrbU1MVjEWr6FRILxYAjEhsf5IHRBR2XEOwRNnVJNSbrM+VxBORsm5i6uqqm9oUNXBP8Cz69KnwvmioLiZTxE8m/YGMtCn4hmcTRfeh6y91F97Tk5QlnIff+vXqyHBPExhv/2POYfhsBcO8aCACgDOA7E85YcrV4YMADfv3kFIEOFOR7ExoDyIDwX2/D/EA3qxkzhes3FdFTuRhouWRB5QTch52B883hZ2t/ASCxX6CR2MXP3Q7vY1kzlCbS42Y1/o8EP07hLKBXZItzJb3E4mnCgcVLdNmBXk/AfV39BTzBQIfUpAUXXPfAR79ny1W0dDlKlGnQZcWvEDYnLLOss8+xG+2uSi1Px80UVxtFjXrt+g0atUP+zbBeXaUGHXXWRTd853t/2OImRoZcpcp1ueejDeHSQzPOKrscu97ujvoXdrkZ26SVSFWnVbdBw9A3OFQp/ZYGQEnY75o0bI8th/YyVlo7Q0kGl9D+yf9//+/9/8+p5H8qU2Epi5R5yixlmKKm4ORi8mTyRGrnvwyH4slgUpoc+OwDgSnFS6VSrXEMgIxX8OZqZq6FINJuGCqQf/4XikAJOo8yoAAVoAodc4+aFFQ06rkmXwGRQmJFcp1SpVqNWnXqaZNq0KhJsxZMiFZt2nXo1EWHrl677bHXPvuhBh1w0CGH9RnQb8iwEaPRMTJm3IRJRx1xzHEngnPSaWdMmTZj1lzqaJm3aMFZS84570IUs7noksuuuBqVWq657kYaNHNTXioWshzZhPE8suKxJ556loJ6MM+98NIra1ax6MWE5Ne8KZpxyZcff8ZMmDKDQenQpYdFnwGnxIkHAQAgANyu8pdUVdfWZFX/DADg5Ts5rf/+DVYV0//21HfdPTiAB7hSeYS/9o+8nTxGKsWp7ce3KndjxnZHZbklRiBUmb
NsrEuRpqssXnzmtzs7NolnvwS/p3TlXOOP3Vjq+ahuXHnAODTp6wXRAdy1HFczzDtPDXvh0+4Xt3P73bzdnL/15snoowPuN9foyKPNTWGRsPiMw4V+Ti5VjnQwcHlB0MHM95rhyWEy4axffuNBQ+J78LiLD/+zrh3PM3dkutQ0ADcv4cExj9dS5rh5XPeYz22JGLF5Bpay4JkngjWNVQHkajwOuD0W9xAD9vxeUr/v0pij2NHkMVgMqsUYnJfwMnDcNgBAuHBgL/BxKXMWxbk8+SHZRIHasyPPQ+7nKmKdIRLkM11KR/j485kzFsBeKQv96iTk+YtefSA5H0YgI6AT32/XI+AFQg/CBqQSHgX3wAz8SOqyA84gtGD9+PLTziur/klAl12xwYd7VgEPFBAASCARHkCEqQBQDZjXgAABVIP0PbxerYEHADSmJ7hLhgFznUT0E7yPIsWqlMqTI1c5lkksdmzY4t4rJv7RMEwZ3FgWHuLziEx1inzOswLl9TIVxKwkzovIhZyqVGZjysrZLFMlYjNZRbbBwmdv6G4yTxExFlvJZiiDB+CbmWpTY+cAAAA=) format('woff2'); +} +@font-face{ + font-family: "dm"; + font-weight: normal; + font-style: italic; + unicode-range: U+1B00-218F; + src: local('☺'), + url(data:font/woff2;charset=utf-8;base64,d09GMk9UVE8AAAuQAAsAAAAAEpQAAAtFAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAADZ1BGyAcKgZgAIEsATYCJANCBAYFgXQHIBvMEVGUT1YA4CMhN8V6Y+ESKqtYNa/7eX5uf9+293ggTtzzIX/TbebHn4BRAUYDxhjRyoJRKYzqIY7ogcOoxgqw+zy8U3nedu8PJ7CLCoMApP9/P1fvRS2NKBb1dPNOiib6SGJvA08mSSVzFklmGgnZQ/uJTCXRljPU5TIVcbsZEPJOW1R2QCABDxEEYTdtXXDA/JDgkH/fa91LIT/9dG4Q4rYt4g6Pc+RzDoJEWx7uacuXGlHiG/pbr19Zh60VDsQVfRGyvvjjKglNPU6PBxMmi+BRT0KCoGyYPv0d/xs2avyctNIq1yClz7on1/XVevkpPynYb3mZW7tOoVYpl7Pq2u+RIK1cLhHo5/6FUl2vPkU4/+cE5XTIdHdUuA95tM/pS7853XvX2y1QEfBLCVz/IQNH735hf4COk9Ze8lZE7i7LihBBpCLEA1BaB5OBBjX4gRY0rNAgFej4joAgBWITQqWISXwizQrJCxGIh9xFy3WZYAkv4g1PxzvDF/A9+DH88wKNoE3wmhRcSjKebKcKrQroaOtZ1jdslDY3us3utt/W2bbR9qtQ/6DL9gFRag8qqvABGTzrFvwDA09DCcx4emouHuK/yTc1RIaDgrFVP29J/GgSVFTxAxJ3p4B5shumwkAxeAz/hgfi+H/wdrwE222fulMpxbaUEEraifecgs8dgXK2Ul+du0Vq8Uil1PqwZK2U8/A0Rw0H/C476GAtMJ0Duc4uk1YEGSDCf/Nyxsgc4bpzz9jMsrSSlPy6E+YbBaWrNzsEqyf9GTgvOT41Nj2a9k0t1TVK9rbW7c+RCbk0qBNBBSzHDIiYI1DBGdj9OpIxNlbd/d54hk6nsguyCjOK6Nbs6M1BEg/f0NUpsrSozJicRJo5ErrIe3JCNC2Mgp4DNCIYB7F4OAxn1sI43IPdF0oybjVlpy60HaN3VlfsljE529WVipQUfXqqNCM1RZ8iUWxQK6KEDIjXvjZ4mdes+BhntegRMBKP4AbhKM4kxsjkwCPuLOuXadA1SQqKM7ILZHtaG/Zmy5Slw8IXTvZxpdPT8GWXKJI0GzIUNHMET7APPIewy6gVgYgbhB/ByLw5lTkCByxE59RUnLhpPk1nAIMzJdvzjDXp2+mW9IjNgZKE2PTUBJmnb7BHqqmi8AT7CMxs5G07e+hh7V1GvK9TxBjhAhfBMpOGWyLIZxRjfMTxrOBIlXWSuMvKTDKDN+nMFYzeOA/uT0WNJMMTy5lOWDCqmKEOApWUONCeuQ0COzLBgvgMmT2AxfnjBZFnhY3gSkDEhHz6CrvA5H/mtMnUcPibXF39/F2lhggg9pXeF5oFeoVC/H52faV6x/bKyh26j1qhUiukeJG93AbOjkMYcwJmHwYd+ykS3NeuKaylHL8+4cVthjekZcvPIezTmzde3FxxYJzT9Fn/jz+07JbshEARUb69ICMvI0+WZ3W5oK72qGTX5kjFvPD5E2Q3uPWhSnvpDDvOGc7c3e53OCHBcj36pxRe5RYT4AolfK4/2LG+e9ce05zfcfnUsf17/U87zJqni8E2WEBH6JISE8SJeYklkdIObF0ae3AeHRxX/iegucpePldeDDRYlxsK8zPzswpS8mM3DwNBdMmqcytdHVo9di2pnOk1d+GS1R608An3L4EDu8R8Tsols5bv+OfW1dx3jiSFUfdgfvuHdtDcI15CFotH5N0bCP0kIIKeD96aZH+Xk67hYxaPlrisvgjdUmUQRhnaSRxHZTbltOS10hAhe4BwCmxazwP6LLmyf/WcQtmbKvJFpNMdTEqw0xDsiF3cZcuoyOnkISrJL9UvLYD26NYuhJdcb/0RERP3oCuABYJaaulNnqOOcsz32zLfHug4jMeQR6m3MMEDP5byO2e5ueRMynvK2tmes7QHgm9pOug5FDMpHTdZCaO2ww5zB3zAS7XgpxXBtX36PdB7/7o9zD3EuUWxd05vPZgny8jW52SLt21ocK+Q7iyvb83ZRjN347IzsrLFe9Y1z53s7uPmYwxqjpIy9+Pmq5TenuJVh3xvSC0DIJk1B3BN8Ht9I2nxgr5stuehvJb8si1/ME/QetxCHqMOwAdyMaXa6KNZHk8DWC6zB3W1YaFiH4XKdeWG0tqWHa2nKqRLPrAmc91uKXM3zlQVoQrwD3bXyrwMusoqsR6rP94JSGvaBlY7RdBycv1Jb2VyWIxw7QqbKWeFTJTa+MtNxpJDsJQTsP6e5ZP2tvPmjY3+RTLmp9u+ivqtZvHR+aY5a92jN+ikdf6ksayhcLvEZEpSKpRBs2VqKsWLrKWYX2tPNejWuyujFniG5zeqZX4aUmls0ZyXOOJU1kBTWOrwTxJu4OGsypWYIVW6B9xMYItfbvTV3SDQ3tp5ZncTbzoe0dn15yQTjbI/5Yp/w9fv70jHw9NPzq3ZtUAZrIuMkVW/JmEElbubxP7UWjz9UCZpKG0obpbsqlEvn+0V4uYVWlytkin9yEDTtqgdkuE4Xlmz/W0KYxuu965jaj3eNrW+bqp+bebDLdwStvT11WuXr5SUOGTCF8kvKCwsM5Qacnc54HoqaS0J3pROG66NicEktvFyjZtCL8Yrj9oDY8LidrTlshaGscxrTZw1/C2Eem6MHkaJmLiv9vepCy/3v90GKHtA8dCqfvQ9ipnUCYsewMJmUNAjqY141Ercdy4e
TGs+xT3+Ih5BhY4PXTgKO9PJJzadOCl+nPes7J10JBXzb4qLi5iJ65c3oGCAlCO6HNkzfacLgf5l2HDuUsMpQyIWvD32uW/XoM+ub+n9anAfY4uP4Nl9gBXXsw2U9AgqCI8ajh/NxAOT25LNyc0asEo+kHGQHk4FTQwc9w8ekwADD8KjRhjl/9OYOCVW+FiYQryI5lK7fuxadreXlzNQylO4xKrEQ4YSAGA0N5CtXA4VIPO9SdqgF2z94ZQur+spQEznIlnQYpn5MPaCEX07+3+kpnyOV2Bxe1Q7PLojYq5w5faFV0j8i2LMJ6/ueFj9jIY5VFk7iWdTWVsy0UEmvPACPKh7GXXQ45tk34GwtVWyE3vJ02ELtjlL8EQf3B/LYmX/XyBO9IlK02zSbgqnMZoc57VrRvtOnjMubqyDUM/1w33bdQ/h0EMR43Zk8GB3YzA5/ngC+YgyAQuGyMSyRFOM2E6l1ySsXZThoQOLOAmHJoXPVPnaLbOo/hYzuZwSRpV13TdMNGCNgcLrc636tXAdelhmoHGwvlhv3W6zp1t7Dtxg11xb2z0lhcU5uS223bsce/VFiMxnEYEQskF9SBHTZsxbgmgk+EB7Ya5+1AZzxONOGLhOFYwcfCLKKuBtJXIQyQ3wbXlXELRqZKIGuWQZAmvioHxMFD9x/Tz59b8AIZaDsJ6EUH6tzBvDt/WWp+nJ65Dv2Yt5E72iAJVEStlJDyaUJst7ycVyqby/fLDcST5GPk1eIzf1699vcD/P3NLSL2fkDvI+cnDyg6NFq4VWM7XzQ+fbzjedrztfdHZ03uy8iHR6d3zquNexFxFICiw5YO+rDwmxP1w8Zf6RAGJT1tHHeqzpMe4bzReYEULoXsTJ03zl+INHfjv++m59UTAB8RAf8exN8U/+ApTRbj4kftc4SVv9fgAIwStAwvZ6FP7wIzaoQZUYDZokeY4uO/ud/O2I5Bb5F+NXmtvnmGU45GZTgpAVWtVpJwTWCKEI13UNAglQhAYP8VCMBh8hFFdV4C+eRGI3byrM4acKEUpnIz8+fKnINZJz4cTZqMWsOe0TufV0LNKNr3/vwJo9kIVCAJWcqAX7r4fJTj7Kx+odJvFUclR6aroIL/+hpNjAZeSOrrr9hAgm5+ztVBE+Qr+5fqBy5AIAAAA=) format('woff2'); +} +@font-face{ + font-family: "dm"; + font-weight: normal; + font-style: italic; + unicode-range: U+2190-21FF; + src: local('☺'), + url(data:font/woff2;charset=utf-8;base64,d09GMk9UVE8AAATcAAsAAAAABxwAAASRAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAADYdvGyAcKgZgADQBNgIkAxgEBgWBdAcgG1gGEVWcQwE+ElO4G3pA8hc0hxm3NGG6/8up/ZE0IXYD5JQJQigjvFYGThf5VABWk9PS4bI3/ONOAHFwCxNLNbXr539uoRZZW+ctNHr4gnzB5NNGgyUgiUYiIVpoeKSemIV0+UJPp2JbqHYhnb3wk1sQkAQghOQtP1J7bkNdbd2Ed62zY/jbl7IRwEZmARslsNEiK5EsWQLPzxKrPgHLtGmDHo/hs7NS+Xn+USVA+usP7yneyn36YVUB3suHHEIy8suHT5i5YPm6rYc++unqkcbGujbHjdmW+gr7CLfIUzhDQQrTuo/XttSemTplzszcNcs1Oz5nh0pMmTh5MRQqbf/oY/vHXP9DXwMOnc+b3Hz+2LnfThD+zwwgPdH0l+igeGDa0qWNYv8HACGOdMgUgIAA05z1MbwnI0g7uUL+FqYIF3LQo5Iv2a/iQ+zRyAbZYCgzxI1xQxXa+fHrj/kaHEPZFrVvewqux7GHn9KcT1rZn20Eb6kijsRPNDgW19O+3SrbkYpj+JrH1/lxtNO4wRg3lBsMsmyo5se5i7plv6viO3ZK0b+ej61sX8G2pFQyH3N9SwqPp/WReDhSFpGXpCqD4FiOzz1xI67ZBFu1PBKOk0k143po33Kdj+FrKNuxom93Kh+L6zcddktLJNPE62GS9LJRTphiemktn1+bjYTi/ymyysn1LXgjzqexaCLksj8sR8mGSRzehAQhUE4EcgaFUOeSv1QH/ccea54exiE4nPZ3pfKMbELUzfwI1WZhz8mhRCwqbcaDKQ0n2pIUZfIYgfhvojG9KWEoNxhlWV+d84KtI5jCNop4FodrmtCIKldv3eZv8C3qmoO9vOdzard57OV6vdtur3Y63VarxWs2lX3Je6kNT/iLbRLZj2y3xhvyBgNlfrffcbFKseu9et3r8p6m4FLUfcbf8Le39qCKahM39hoGxz+VrWyO5vOc8s1kLwlSCdcQ5qWoeyhVfB6lPBr1Kkr1pUtenwM7Q+5gVQP2Us9fKRWpxuw2WcssXttFR5Vdibqj2gTnP2ju5bpmfINv997mKlc/R+PSCM2R48nfE4sSvDWRwo/6UuU4+9ODOxNpvNYT8aSrGdcyVUVRkuuysq5FQxHF92NWdtI2uqgSgAY0CABkQIUqZPnK9dshDSQAgMFBAADACYw0mrBYzvkjzbVQwrRDrST8TBSgcpCYJbwDsHe15AuYOjthYdfuBVNNaip+PYBgECR9cW4PaCDMFbMa7CYVKrtIirIsEFmSQGuXbKRc08HBypbsnw0cCimurtT2sysP5c7/L02UfgEA+K3z8VP5dNPte4OjB/7f19JCEEAEQbNJfDyAvBmbGb7BLxA6PviHt3+eCEHIgTI7TbdTnrMaHXZaI9gMMgEgFfYSgUjpANAJzAoBCToVAQTQKyIAGEclnkKhjJGn6Dx9mTr1OjQ645TTmml9C1pTTTbF7Cs0oXVJ6yhXaG2uIJ+fN131vD7HnNPstBa1JtJa4jxjyFbehNATsv0Eba1Kx01c2g2ZnfCjmpxRp5bWFBNNTqsiwCCr53WbagEAAAA=) format('woff2'); +} +@font-face{ + font-family: "dm"; + font-weight: normal; + font-style: italic; + unicode-range: U+F8FF-10FFFF; + src: local('☺'), + 
url(data:font/woff2;charset=utf-8;base64,d09GMk9UVE8AAAJwAAsAAAAAA/gAAAInAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAADYIOGyAcKgZgAAQBNgIkAwQEBgWBdAcgGzMDIL4Mb4gcpY8lI6bRFpuvq4/OWEUk/8G00RY3bzZ45QTP/4/7fp977/hhqDRrEowk18nij7AjlZhdyoMv5jaxZE2Bj3goRCo0jjQxTYTacALb/XAD3u4CDzHDKgCagaf4WPsskuLqgQ3PZuX/n3qBs2kmff9/N/kG2b0PAne7CZpHlGAY9xJq09hCASUSoGTuSs0f2zl4eIgSJghGISKxbXaRXsuiPBy17rvw/Wa3uNuCe5R7qd2i2WdRPsGis68g+3ZpyKuftUT4Ml8LiPrqLUz7F5d4lxWHEbH1TwVznkJxQVFRUcPp5HpB22zRiZu6iSW3/FDmps8biI7jmrPj9tdyRFd98BQsNBZBEBFl2zjn+X2+8fxv6Xy4zzkaIU9y/93vGj4f6Sv2n94f9TP6mJ9Hjhzx9Fp4eubk6SNHH1qsnnViFoQdT0KAaDJlk7aOyw1hMAChEACF5cmnyrGLc12BiEREu1GP5QhhGG1RP8C4Wq3coCSGCSj319jF//XtoAt9W6KaQI4DVI22ALkMAAIIJP+TmbHrYDBPAH5p8KtXb689cwMKjQJAQH8K7tVQaL38SkDoDwAARJ3ABkOIABGM44iJAtYQEwTDGigUW6CBnayhJWGkZwXhRfafy2pu4jDqwF25ShQpVrUSDJE05ZqfwVAUZ+7L7O+kXdQXR7maswwqDVrA81Xk1w6+d1Rg+3GHm9q4LECxAkXRaxpCbqELSggA) format('woff2'); +} +@font-face{ + font-family: "dm"; + font-weight: bold; + font-style: normal; + unicode-range: U+0000-007F; + src: local('☺'), + url(data:font/woff2;charset=utf-8;base64,d09GMk9UVE8AACGwAAwAAAAALlQAACFhAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAADc4oGigbIByEAAZgAEQBNgIkA4N8BAYFgXQHIBuDLVGUcl4kgp8FdsP1ASYUGukvzRPoL5aUsd0c4ERYiIHJlYyQZHZ42ubf444jBJQSx8kdRm5iVE+MQgVtjAQzVuoy3NCeLku31hW/Ig4//skDz+X+jQm+wO+aHFka8ca05wkU0AYOdPr/T7b1/3q1915+qvU/NZfvdGOVIWLAlDBGgiERDEyIjjOCoXFCtt+4xhAnoXkimGEUxZQHMdA0YJPsVtqEYd/iFE7u9xNbBt/uthx/R9T2vqpikiFE3P9vzS/7buBMerAzAKTweAI5xo4w1dUdqNSpD1jJEGW4/2CGu5MhxW7YjhzhkgwALSDIdbtq7cpdYVYIDSj3+BVO7WNsifMVPs0uttnaqV0oIZgF8z3P7gBAQggEgBNVVFedXF9X7/KxFVc8zyhDAGQIZEGGIIohGDYIECOXBeM6aLDr1r0jX5Rg0YaNu5uBGMukKfKkrn5m+rTykbR/8ax/uJBNCAF0pqx5S6BQo88eM655AX0K/QZQsAF4gjCQCTRgCBwA02AJvACfgd8oKAWjeFFCKImUYcoRynhzXWVJ4RyWRRVptGWs0SpNjS+nltVUVhQVFzWpisPaDqJf6wjVZdp9jLnOjVN5mAgVwbk7Rv0q309JkKo3L5xXaTNPqqRXFpZ+r3h/FkJDUa1ev4bfgo3U7Q3qsrqtV6sQgrONB8P/ZQFkzE6/dQlUAWPeu8NXx50+Df+rrYNzOKW62KCddBWquPRqORYm7CwQXU7RVKmKO1vTSds8e+TTsbxNpi5rA2Itb+0uc1t50v+yfMeSfKeJYv598fjA76nympLqH9mgZKWrtaVC89/58AyTkm4D0gT4BBAEgm0AAimkBjFGUBcoogI7LPVmBYbfYBQNXZghf4Aa8C2lirKXoodN4Gi4Ap6Gr8Jvw78i6cgQ8jbVndpEfYZ6ooXoh7RoWhftIe0buunsTN/G+IKHMg4zvmASzEhmKfMa83sTU5NIkxaTfSYfsUxYmaw9rN/ZcvZV9jsclOPIGeE84/xp6mQaa1po2mN63PS86bJZH7ODZnduHPcA98KbwNvNu/Hb8TP5Lfy3BM0EpYJ7wlbCCuFjoW0eZV7xF406ElUvmv7/3zZ9pWmXpmdeFa8mvhr///BM3MK13nXc3dZ91F0Z3TR6fPSWZv+StGsW38zT7JTRyUg1VhpHzf8x48xvzILm/9uic8spLb/WohtqNU2Dxjvycz8Sc8/42YqBerkZ64yfBgUxSCE20zrTkr9M2a65cd1ojEWXJ1WqJo0K1yxd5kwrD2FgloKpL0SYypNXVjA9d53a9e20PhBRKfSFxFzDkEufIx5Z86rD9FYRdhYPx4lIPc2HDHmN65+8b8xgIb4HbAE4/L0vsTsdrBjQzOlkVXeGcj3euFJZF76Q+ALi8F1rvH/1Xn1u8neokw3kUKJTwnT7C0FsruGc3+m7J/9KeHdB9af0icGLArzNlbdh64LF8NMk3WPgmDj8hWv1ev7zc01fOz1yb/D6iCOxswTRlg7uR8BF0K1Q0F836IJIXFM4XXOdDs0QIY4JX+b+sijx0FRleDdO6WqPPJVzubEFpy4xenfNlbtQW3Ea8cfJdrkm2FsMh6LNm7fdqc5al6StbTxUnKyarcOMBI37ZOWlK5VVE8/16DlpslfPi+MCZlGTlCWTj8/9NB5PalbkqW2aPIIWVZIMGOh7tmePSfHBytYTT0x5m/YPQfNUW5xax76wcx/6JFmATiYwL7qcoz9l7Hh/pma2+otcC6LCIjtHG1bHJycUL5vdXtRavoGiUcLiapcduA7P/jFfLfy7oWAav3KBXZk7/myQm+hxwwikbSxZ7mt2npQWGeRSZv3k9HmqOiU2eLCvomA+2cgj8hJXhpd3jmCMOGk6pOHk6NmBtBdVsyYzARKBOHEbtyWCjgGBBA7G5POMgsljBNlQiovk18kR5WrjXNsa0zPAAOkyxVDAUGAYCp5BDJKQPjeJwWXgcR2OkLuQMoK0Iag2JrctWHqMEAo65JfhbOZEKw+X7uppcFDJsnjKvTtPN944bAcuZKffuE7mbzI9G25CT+uzKsu4kUyPp0STkqFMw7jfgSt+J8xxQmITuQThAtcguUIWSzLaxDXywJlIlCKNJJDohSglc3n6faZn71B2qvY8X32AK2HLwRHPfFb2+klr3921vxfXCxcnqmMGe+gajvgdahq/C22rLsqKax5IHDgKiRNHT+TN/2ySpEHtuKbGnWrM+kzauS8Di1NUbkMg0zy3LK9LjjNh/Pmb84wSS+oBlULhAU+gCEMXeomOUHInd0IpmPoMiCheeo4MlK+4UFIE0f1951ZLZnDKts9I9KtktnnG/pRoMSWONnI7vP7OiflFXPNaOgfuBZxgqIeydN3T7ec
9wL5KIf1UivhsFzXq4HoKtDZ3Fq+PjS7yWRaheSvVesihSsNOVar+FJIJOZADSQg35s1raWlubm/d0EEdDtocgVBEIBLx8HWB3fkKWTSPS+40x/hnDrt19H72U3dZfmUCoXlu41ERFsodquqRE1mWUB7bXtlyA1Uey/PvFT/TypzZt4ADuZwY02dmsv3w1Kk35pij5lBjj95IAeqG7H45sNq0G+NziHN7z9ZuuyBuGyDR4Ni3x9fhmldVUGsZ7PEyEn9/ri+swJDEMa5/T0NhoS/EEJAaQorFL16jyIJbkA25cgyrbNaK0047jWl38TEtKL33jUjsWTUcjyTFlX9lJ/UV98NTk4lGtCmjHBNmTj2caY4bTx1283a6zw3repIFb7jFPDXaTp3b1dhE/Aw1wwrXcytsa/GWbcM0TxlW34amxFETwHchJ37xSHg8+5v0gqxDmFxHTmxRlxJ/CFiWBEHdl7/feRo/e4PtTo4/LscolEzInFJMqhMU/LAFPI31KkIGr4zu1H7oUJcuo2++IP4RBX/c5ejdmtfSDfA7EHzfedPSyb7KXpAbpZArtdAF1wsUV+CADARCbVQLTdmF6sk8x0r1ZlbICaH6V9jr8JC3FhhES09zHukiR4DhEMP6oR/SotQwoXuN6/uuP1W6wOp7mWhE6zWBOEahvUkez3t6PGM8acgLyxZ3in/bu8fkP783sE85yHELcuPq8RG7yxafuv7JX2heOFQbi2ueWmT5FFzmKKrDtwEn9BiXvkt7zvnJ24KwuK5ETU8+eGIWjx9KXZj/cvlpN8LAgjk2m6gXCGr/3OhJHi1bkh95PoiBvfGzipKha0Obm6PEjHeilBFYj7HXHj575+Y9U9sCS/TDcoUKaPsYYrHliXW1SlwxmySrWsajq1W4piyodzZGQrnjh3blnhdrDvCkjAIcWT6XbcF307a7e38fXpygjh3soReq+9I8vUu9LUxUhVwexbqMEUpIgwfe6X0ogG81/Vumwd8Pkd+hFlSxtB35F12Y0jPWEO3LQKxjWoBVykP9JtO74n1SEE8eKH/khRhkYaFZJRUO7K6FpNapKtWPcsT82vQSvPu4dIWR7XqC1h281D6Onzz44PCfYuCdzhJBaPu5Q2vwa0nsTM67O5bd9/+uDjWp374iU7Zpbl43GGPRlUmVUlN0ffU9ZXjbj70ljouBkhooAs7zGJbku5UKENjb8J1nhN96Gy7AXQocBwPmLx4MIA9jvvjt3brDxRhRhncna3LytQDI+VVqsZEPw0otUNwTbAqj8lHGRJtiYN8Xo3X7lu5+0+2qp3CqFu981Nxbg42a+/gJcbwV+1GR6vECbIw6gxqGAX6B0twwpwZLx4gvI0WN/7AwbpnJVTxM3zBK55o3gKUAWgOO2gD0ZU58EAWnh/dh/yC5qOef5tpv+vN26+JtF/B+FyjAoJhGHUerYklApwq6xyZNdzJJigUeboLyMvHi1vshI+ZWGhFT8/gtwu8oCDtVH9Jl/CehUwe7CUwAhHTkXQX/2bOLM8eNES/mWfhdovoJcpMv0SmCRqm73uiGCAjAq3lyytS8AfXn145AAJ/XOBEXlupDSAUYJ7tTLmVOs21vYgR3JWZ7AFCbQfJGHQ5y+JdDhDEzmHSetl4KlIDzhblKDtPPGMNFyL0ILX4O28mcaiqr3lgilgYjytATPAgvfME9ISiLsPaF4/gjp/oZ/wlpoi/iOLG/GR7a1k2MyU/JTKiLQQfvzL7G6Efm7YusZtiQC35uSiQ2+dIYbX1EVoiD7wuSgThT8yqZV6GRpQF8vJufi4gofIkV+vI57fPd1KCD+kIrpwOt0wrWY4S4m0ad0CxFkQOXA07AUSqFf6b/0hsoUP1HSipsTl/Y2v8ZQ6Zq1bZc4lOPhH3dLdBNNSeyF7fIiu347lB3dJiPmV0p6zt69xILjx9BTTj8R8mzcvE3/+gX8qHagJ5dHRXGKb7jt+85WmKw5fSvwYzpaXhf2TIScrpEv1Pxu7PQvEXeVUXki8/1NPWgQtLqanIM17MbdUohNMWDe6rqj0SAD7xqfSFcotS7Qs899I/m9CsgJOIe/+vmcZJ6GCYR9tthS9dKzCZRf0qcF5jm74PEmaQoIqZRxzWvBbzwjKwOYOWunovEKPWZwHX7M/YY1Rdaug72iqDraoW142F1Ts89i+9zFADBr89C4y2JeULaYQcmW8CZoiiyNTnSJI0+Ntoi1vzuniRqYgiF6Dpf/y8JnbOlHlJOlbG4Fi948WT3A9BXOYOWTjbqHuBJmUAyOBcRCMf2dZQDtM2Stjyk4sGpaV27JyUbIdM1XVgBR+FLwaqFKYCzQLElacqJgMMHBomekCIB+50VINDo8mDIiRxOM4CslJ66Ykh6l4GLDMjbUIqI27Xg+5biz4sXb2eFcSbsqArj9j1nMOoBlsv4D1sooyBrVuzv2dc7rbI42e1UbKoLCD865cekWmPJb22nO+bp4T1PLoPuglP7++Rk0AB/EIz207t1/SoAUbU2UOkOaJ9sKXPiPY+EI8At+GDDsDNE7DLJYkf80puu62fyf7uY2ya/b1w6z66kdj/Y0U3S9t3IlPj1fnBcZD5aVlAW6Oo2brb1oKSd99ONMZOpow/8MA8U5qUUzQhxwefetJIlChuTw/k/IVRwPa3CPiA0T6WV6wVlAXwccJZCLQmrjEUAAu4EgRnqRYy9LDqPT/HsPibvB0MtxwgtO2NVlwn+Ul85MRcV4jho6VhrTnn2OiWrdWbDNa9aG4FfZF5A2QT0XIiiavnecB5psv3RVofDZJITE1HL9YWFSDtk4KPEFcPJfTktv05OxjQ4Lb6ddTOiI/cM0YROJYeIN+GYhWW4uzpwPZdKDq7x8ynN0LybT36PRupDzls6CbVtyfRPSG1bMAAOd1yDO12DWI8UIzMiKVdLTt50C2dwixXkl/LSbiwEnJjGe5LeC8dlAidVQQts7TL6mXLwc4DeML8F+YH6KSpouwp4Y5ghq4YsOcIxxqrtnlgFCQqlo4wCugPi5Ee7WA+hUf5+WrlHlNc6UWTp5MtSTAQpyFpAUEg0vXHVddW6JJVCdL2et5ajKSc4+vUIOfDPH7DKoxzZUEoTXRDUuEBBsUJTei94oB7JpyjYcIaYnNkzlLIxzrpyjXLKHdvVKmxxCZX2qr9aRTVrIRMgGVecKsHC5F8XqReB03oxLavYUYT9w1xcISw5t/2ih3y9vL+tv5sO1kb+/Vf9WBFwIwgobewz8PXxcvzMCHbxSuEvt9HdiyPafW8uyRxtxaiR0MlEJ+hxOhgmmcBPXrq9g/QztmRTDeon4lMj6hKxCTPm7hItKzSwVPlWISeTFwjjjTrdE4gCl3+1+kx0Lgb60UWXbu3exeD2jOQiFfFdnhrdRkxvzismcFjErGE4SOfl/QtxxCbByNiYwbN33F0kMCgu5zze+9KtZd1V3kGHqkaqHLoiuywMazv9Csxg9Xflo8un08eeJJ7PswfvtLlCdDcF9SEJRVJSqA28YRkuBm27qa0IJmQYtlW6M3K17h9dWD+Kj1
fI04U/zoeJ7nnjb9rnu6hRC60MEbOa0UlxZF72eBRGbl1aUZypH6W0Z0YzuBnkj5cS7Au++5fPpo89j9OgmP5BXlOou8OoIISmhh23EVPl8Fc68W2WvFOa8/ysmXCdTRib3ZqHdRpfUm0qgv97C9UpINznA6dMCunagpKMJEEYIGJio0Fe+EG+2HOgVY/VHUm1Mz8WGgZqFHLfgcY6J1AMyBdr0bzuC6pKQmMNNWKbqCi4mj9wYzF1Cq/eR32YHHdgGNYypj8RJm1rLjSKrIZVEORnnkzehXuR8FRlF/CRJO8wIbA/e+bncxncWYyZXenad8yexRqNEi9Y+hSD3f2boJmqvRPz8Z3B9XBecaYuCiOX2LbkbmI3IuTls6+PO09QiJj+dkp1MtZhnP/Rg0slIVPLzjB01yEnE9SGoXsC11rjsnE8rFzFoe1eT9UdOjmok0lvABafJcIrhhYB68mwJ6fJ45pKK3DsE5q3XLn4KoFFYh+wVvf0kLodlWSPvoH8VBeVDqEB/YLZzhTRT+iPSRurQqZs0964bjDgZSsTKpWmZbyCiurUz+a51khLAmF5P4Hxsl2+b94N99NfyiA11RJVHO+kRvSdNTezuGDqyYeG/pjcbwXJJjyiVLhMpZzUXm94LFx2APHZq4gmkf3z9zBtYXDi9Bae4QqOMJJwGqVapgKEhisBp1qM+swHc/yteEc1aQxmD6B5VfOOjxzqSJnzutJLW/88bJ58lbZ/q0pnQX9lT5UStndIScV6r+hKhAu3jutM6I9ftCnkaZzFE20JgaTFLdkN+OGM8rlM7M1FfbUtxE4RH66y/nWRj38Sxer5eciNUjJxeIfpXnffPTEGgkkfRfuEiVp1Oa6XrD4hUgrEgLi2nmsen0oud1wN4x1ATnySJeH5jESRgPhW4JAPebUjNsVTUr2RCk9z1R3ZIb4EEmCn7zwIn8mf8y0TBncqNJhaXjy8KrJ0863klZJShtdQ0SHeOXhv6enSovOoma3KG+klq8fEWqEUEJfj4HA5zVGxHbgTfao8mbxrr0VYL8GnWfJ04U/zYUL9IryGasULPS+bGoRW3HIK10syPHWeG0+pzxc2h83oeV/d2MZBsHp/uVMdTdRX0Q/PxalN9/euuHftPt6aJ87fY7uGNe9sxtIHfzldRXy3ma8UygBph6CMpfAymue2Svc5YOYfhvy64xX5hezw08ypqzDvzB8fnj/QmnMbP3+BXUyN+6uTm8TdR7XudnUaQt40q5ft7qwST5+gjExIvaBPMsaPofY955/3BANeWwKoOjXf6LDb1JBR63yommUNzXIiHvR/+o7vwK95eM99LH5RQVMxNm3m1U9NzKK/hvpRVPa9uhXdhZZVOSPDoM/MbuUFnpATdfqPAfWapCeCNtUQH9xzF/bc9GL9nUl/s+YjrrZ1FPqPtp2YML+6MhPrO6m81sRHQvNig1fFR+qfYDYqxPUhqFV61oaTl+1ODsqdtebayYAyDTjhEFXDCyHedvnHlz977Omf0ymihh+HvICUo6ztcBlZBonfIJHxduijW8sfiksubf/BELUs4m2IFkMMmauTpbJ8o3yGvEbpK/7BC6JgDhWkqpVQJBGXkYDzjKDa0AJSEA+KPxDE0XnaTJeZphIhcuBCzugn7/c19DfSZ2bUVxViX3xJubvE0oHs662qKykqO37tYy+x+Q+5R+4Fa/jK7y45V+w35AbQtosEEQTx5X3iKXQiBYyBw1iEL/+d3pFnk6iOdt0hlwhNbS9ygFYuv/5qYOVK5rJ/aA5t1Zc8fr+S8mLtX6f3BN5SjuyO7dkPy0KjgktK5FeiSw4cufdKAAEYlqBvTCqsHKzvqSE+/WBAUyEmtHYGkPXrKVuFfeLbZb07erHpqbrH2/eZhHqy4XepJwUvZlycvA0/+8t4/rIr7+NrAzb86e8nNgMsPHAiLZNfXWiwX/x7ad/IASJK9+QWb65rOFhHbHzjQHW5OHVFlpFmaJTvHflTR+LCXT0b85Qj61E9raG0tdAo97aCg4pSEomTjyY95j8mAy251EOrTsoZGSo4uJWlHazrryb0xwPv9RYqxeO2Zj7JNCZ/1JKUJJ61UX2oGtdDa5v3N2nECSkt2X09O3du4XHi2x8lVISA1B4j0ZzPjIdmoj/u+W3Q6oaHNhZBN4KCaCsKD18QOaUeoofWz4ksbvjcI58on348MryJa2jx//yRA4PKVY5MLAiYb16bwc78Nm91op3Zmw1zxOJtZ2YfuitPFJ/aRYw+Sp29sKpcjX2YOTa1QGxbVDmfhmnNrYlBYEPZ1jQ8HD2lHLoOxfyXNJs6KNw78NT5Rl7YbIy3Qptg+RVsTjk+BQXJ9VKHulFPpj5y4oqlkwNxpHGZjsSpJAFuVT7JNxEiLnx34RMPipbf/FU3ESnA73GIzIxBL82ltVwiruR1xBLEE/9eMtLWJh68LfXhEmNiZkdOsCvT2zrchhNTjEruSMDtnltrnQsamjJq6ln9tqTl3FDa6ByLocZzNMTVQ+ucrCRrTEZozRv9Jj6qlx81aIvVvyvUgoLZA2HDFRB2TTRONlHfCxVMS/4aJv+KEviV4MqV0JpXAhjFtzJWvBh9Lbkt8FEkflUyeIZK8tFRaxlHS+0kTtweacQ3KVJM/FZtZUCYKk/9PBl9dv3s1TAaDJ64dvhEx7jduDyYj0058boJplj5M9VqoW/pseRT59x8n/W81IG1b03wt5xa2mKdw5bgMp/jzdhLsLpixxDcN3IblooBidO6ocrH4brbjCYpnvHyBfZexWhE4AbFoHQIjooPU7VQqhrUonODKS4kyrJK4UVyFnTyKGo6yhvJaT1WmSD4Vftm1nOp5uN7xC8X3WSVAnJNiL2cHeNjphqWJtlQLLUmyDS1WO5/4A+TO4v/U+GspaOtek+x1f3lJh3NV7NUSFVDkWQO87OrOfz8EMBLBwvslbHWkCBWr/xuSohZFha2BWbT3EDvyxr7ujRQuD9x0JRDb58g4NCVTBvOOS1YARlPMLNjbL2SsuTtKLXkDZhAy9Rdu1z2QX4WcJHSKVTEiVfAbfhhK8oX4vJvuA/mlgXLQ/jKDgii+SZnQ4sKlyn9WeMDVNY6yFP61aiIYkt78xHEv/36XfiyBtsrWL8zieaXJx8AhTof5HllskiU9uIXAA/N9yJ39tL6DO76Ozzld4APUtilARE0TvJ/BikGjgc2y8LYuEBaDaW4yoPX0cySwYkQM8iGzsljhvXv5T7DsFQLNsyG1P9X2nr0RPaVQJklVTCWmP290NSiIaPBJfM1vhvB3fQTes6IX0/ySwQdFnbq1jPwWxOfvaQ1tC5VDbYzu1/Uj0rVK1QNbWNadJMCHFPH7IP+yIagZ/w08LFq6kvKoPkiH9OzaQZKpnaG7FGOKRSM/ZBZFjZDRjM9FN3wC6detMqJ3pqv+dwv350mzjWel3NLfK7pCQit+fcZAIKGaScE+Fl2EGqCjqnDOFaAvP9jGGa2J92RyhOblRdttmWUz7TmP2WQYyEHeA2XnGrbN6NFJTQBOWughl4iHDqkj
P3IZtEYixGsrlZN4kdEfRKyv+Z0p4n6Omh/wilvsbBglzmbXpjJVdQ25GbPNN4lNLXoISU583AN1hNjwnZ5VZGGG7TNuGHwZiw+oYdv8cHa5BaEm1K+OrN65D7D3VaKxbQpVtQehrU6FaUzII5en3J/QoohyZ+nOzcy+Dwk0Lzm1q4rNQaol9J2fym4nqI8G4TsjOfW74eOI1Ix6HhRCXGrWMPrnNi9/s62wdT1Xi5zsO5E/bpIXSuMZgNy8VrRDXf5wJO12GClWA/h1ZPA8DuMP+xlx2BYgLu58hbsXLAYdobwXXPlyKNrfc83TT89KloSG9d22aC+BrhNb3S+KBLW/oBqr8emWOTEv5Ne8yldRahEGStA35BUWDnUxGkPr9+gV35L1xmycuXR/IavnEsv0ed51VenjRqU2PAZ537zouvvofpd+LPdx3SwtVQxjd6R/sD+wMfz6Vv6fSK/WGnmvrJo8ZDUgvCeKZqh9ppXmpvRbVZfufdCNqfcksGlewwn6OOKsyFfAHmlg0siYfjdYPuYn1KI3oCcT2fZPb4P2QKE8gVfBnK3fo1/++n50fk+PdBa/7nQUk0IsQ7Je7eIO3En49C/SjxJeClT63ML7Vl1+Uuu/E8iBz78+fYVHQjIuA1syejJhUf0Fe+R3Gru5rIjauK7r5kzPA/QAT6as488zbrwdIpLKJTv/bJJW/6Jin2SX9XLt4u5oL1+VMN2VAoxzyu2ZqgiMVfa6yeKKvaY7xXUeCLqSK5iZBjl1jn/T3oNV7vlDvIHg4fQ6Ed6xsaSvkZ3I57r8Vrcd31wD9Lf6KHXWxUGt+Qe91o/p+uYVXY8dG/MUHJjMV+Ub0VVjJDTlBHGOrpxhPEG86HJG3vmZ3p5tRWL9fDw7v179h6YYrFXHQUSCKLuhAAIYlk1pUAUK8vTFapVtFZntFs3x9+IyRIOzGKSMiA65Dt2xaxwcP28KFCynEWRtm5fAg3K3hTQBOB15kU1Wkj8xj9rqoOkwrZWAcvXhtGDQrDPircXoAOBG8RhzUeYBBmUmmx1xBnfAgpgAXOIAsVRByYlVYtCCkCCI5VAsy4UB3loUxyJhj5mrNQrKxVa+3BxtQSlaZb2PMPWoYihUdlHQkItJUJYUDZ1JONFA0Ewk51eTFtpyqESzYF1pdZR0ahTqsBbnVnmqgMTrJI1SO5pVEgVUFlOW84hULYWpSEapgSFggNLUTRsuJwZLh4JglQI5QkKjV4BZ0KA4QexhBKLacHPBUUJy876UeIxAaIAZ0gbbDYPEt5BVRA1HyViUV6HIEgxGDhAHkGWX+wNoGpIujIhHXdPNPsKEJStBRhhELQ/ZdZMYY4ymkSu1mxhKfAwf48i3AQdccwyCGLr4aUNQHpm8P9c/woCzhMoyz2RSSzToH/oMHJBLvioo1x91/2L93cP8eoXxi7k/Y4wSg58fkVCo6WcFFlf4wr/w7Le8AAkF1Yee44AqgGdCW+GrXDog4nruIh6DIjXmRqiIgoBqoqOHEo3IAWyVcK7iqHhdTShK9yGYOUDKQ9rUK2aYZXsBZWChRp+Kgu3d/47NqsECmywEcFKJPASkbiJYmUBIQWZiFWRMJoQAoWqx6FBNeL3LqMgCskoCpcgHr7IRzrqaMYA5pjNxiWO0Be4FZCK2DoMyOvjQLsDupOduQH17T0EiV4meqlIzMiZcmkL9KgVujgL3BmRtkdoAYfWGA2HoFppQPTqDI2CodUxg7fvANk5VXUInp9Uj+CMVn1ibwQ7h0qslijtySPVa9CuSaUKalpSp0l5cOPOr+UD9jPjopiupeTK2iNrAJY1XYNKVNP2ss3qyLQhnKM8FksNKMpylIkWpUvJkKG3JotKaPak69XRhXtfuj21hUHTsa6CPEAAAAA=) format('woff2'); +} +@font-face{ + font-family: "dm"; + font-weight: bold; + font-style: normal; + unicode-range: U+0080-1AFF; + src: local('☺'), + 
url(data:font/woff2;charset=utf-8;base64,d09GMk9UVE8AADh4AAwAAAAAfkQAADgqAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAADYHrVBocGyAcKgZgAIVSATYCJAOFEAQGBYF0ByAbb30F45gp4DwQRb/huCmiUrUgUZQMzirg/1sCHWOHakdUTSDHyhg4IaCSRbkq2MMpGQkBUdEOXQbNNbZDierW6lbrTK3brVNaKp277941w9w/T/BQC1WZxH/cwWYkOAWNxC+Qns8hAuhckTpDZLw38yO09BHv/PPbfNj2lB/FxuLd7d5FtfZ+mQU2adBGsSgsykRSDNJigpirdj86nh5+DE/b/HdwIFkTEFTSLBQr0d4UrFq0ujIiF3mrdpWudcWq9c/v2kViLPwnmup/NpuZ8XufpX/1E/l3t+SAE26e4zCVgFywUwKVOWlcBIdNFDBQbMnrGmhlCpyZXLn+8/za/nn/PWZ+nhfHBN+MSbaNyldKHMqhBe1njdGDUfgZO4YwCgYDo7AxQBsQhCEkHmPkOtd9+W9y7t/8+0IOZeXGTcrj0gPgFK5NIenaZIBCTcidJ2Un3aAHDn/v7v21WphQNIdwYhEEFFA8gROPC3/3Jyj4p4hus1eUUGaZhFGWSGD6DcDf0R9uwMYN5b8/P/x/f/+Dtfb73c95M02YNhEEknEiWQgp9X820/bP7pyt810hmSIHgF8v2Vwm3FR5KVczK1wdyzlcA6Jwbo04Z/kuugARVgRdKsRx0yVFl6JMSqIu3HdJ08X//1M1E2QDTbPELOf+rNex7DYd4LWxqeQVCgb7OKyFkg2n121j63AoRjKqP9JmuErfFVmFAXqHg2wF/80K+uEHFpRWzI0qKS7Z8UwrInoKQ8X3MfrHv48Z+zlW/7I1axzR//1HldHqKcX7r4gB2c9nlZm8HdeQ8K2HSPDjYW2KU6/vlILCX7nfKOfs+HAAGSwCz4AlVllvh/2OCfWKWCnoCvAIVZC5qEmLTn0cBl0x4bpSFQ6p1aRVm3adfAOG8yvjADKaEVOW7EIDB7KOLWxjL/s5xHGCiCSUCOLQkk4us1hACfUo0WKiDycjjDHDAHruM8hbPjHOP+fGpa52qwcMNsAQI01QZ4YzLbLCizbZqdFenbr1OuUNb3jPJw752o9+116D5MIURMiwxhF3vAkklgzy4MJHSjVnkaOgFS1GnPi4SRUtdBPnCKMqu+XVKUpUaKWjHvoYaZQJppsjyxIlVntWuUrbNdirU7fjXrPcKhtttcu4R+WMzZ4z3JhEGqs4xiO+eTkxSU5W8sOOOGcjjzLt0ccSW4bizc1FLJcFoCCmiaBERoqR1PhdHLIywKHgQBaP0ktzUkRifpWLdWxgE9vYkd0UiFKRjg3ZTFU0rGNju03OKGsiEn/ONN3pMwDJExvbt3FZYqVNLNWvUjxF8+5CHn9kLdKa2WXyCPRxVeEBLoMID8VIbMi4YD0wOiGzlM1JUuIFFkv9TSYpcV/PxCaFpi8x36H7uGmISLlcFFHBTDJXpJYa4FhA5TA8tpso8mNuQzv47XmU+t1gpWRI9xM6XN+E1hiieRHjKaRINriz69vO5tAQkrx5aoZ55Pet8seL5i9aDBm+KRkxpvmroW2pFgvaJFpbv5bwMtp6MxLM0AFDurTtBLhOpdZNKH7iILkrk4pdQ0+nIUNJdPphGgCo5WUeKeoKSTgIMq3gqUr8ztKdO4gYH802NKBqJEcf+U6js3/sKD8Dbbz4GXSRCprgeq6nwIKPBbSZuYba9BRNr6IZM7nW5mq0hmeYe8nQoF3MwzgixU5rdm2hV1sGliZ4/qXT/GLRlosmwLesE32CDrF03hBCwLC2zvB+tdnafNR2btU+8TJ2sZLxCSNVSWbjSy1yp7eIZVNc7SiONlUaL4Lt5ZKGRAApRwANlNPZ3/WFRnyWHbPGRSD3udjGGHcxwUssqNkBXR4RSamKoI0dbMlmKjKwjk1sYV9qVYW+jR3syl6Si325lerJGGKEFVlNMa3UlqlaHVAQcXoA+1qr9UIKJ1ep5ix/XEJUqeYaUyTbMjfg8UxLZzlRne43Ic26oyETPrlhkGv7dod74eH+82LI12Mkj3OVnhRh/ptCnP8lhYPTm78KibQPfFIvSW6RQwEAJH3kAdkAAuQCZUI9VuBAK1igDoyghWoQggDOQRnooBPMUAl90AUaaIZeqx5YGdDoUTOsgRoHW2VXVtWOGNTC+Sb/m2PVBk0uJ1k1blCA02NLgl5wgMszCOBuMqRvSN4KR2GsyVmkUwWm4Axc8pwvcAOuwwA8gGtww6o35m6Xtwa8bcw9gHfgGTyGD+EpvPQMgvXzgOkfqC1IXciXl6AD+d6xaWNeGEte5aaxgpJuGTNirFiSEqnZAvMjLUhbkHZaMG0MPQrIYAEIpiqAZ84xbL5IN5VfMy5K8Wa8dbb5yKEsTUP+X2rXfevK9cEW2Jr2TTuyv3/MOja3bS2jyZqxPWlHemrXdld/uw8MOe5j2ZCP4fHm/Pf5nNwpnw9qbL1KVooy1GjNVu+pphXsrX2HR1fRtXTyVjrDx0wt3vRbXszM9F3r9Sms9/tjp7DdX3RV+23tq0OeHZI7HO5Q+3/2HZM65nYsG9+j0/ROBzpVTdB0Pmvzdxt3m1k2hbY/2c6wffr/bl1OdXlzks3v2t8P/l7h/EcVrNqltkIXqfO7/tZ1e7d/dAvptq7bk1Pdu4/rXntaYo/q01f16tLrjJ273Xa7Nvmt9q2T7R3GOBx2tHbc6tic2t9pn7PKeb1z7TQfl+2u7V0z3Dq6jXWrnRnnvtPjZ494D/PsAM8Mz7dzQryM3lb7bvC+MfcXH63Pft9ffPv77vN9lRbqt8KvdV4v/3X+bW4vB8juH/b5sc/WPuULPfpm9K1bFNhvbr/Cfn8tXtR/Vf/SJa4Dlg24saR1aZeBSQMPD6xYFgzUBd9avizkQsh3K5YM3hj6l/9FYb7wtUGbwr9ZfXfEB2vOjXScWj1s7cuLgme+sihk96vU0GWvel5DYabXftm08vWm1//X0KPeOzMlYk30v1HXxZhil2n+iHXG/hqzMHJ53P1zQ1FB8Q0JS6KTEuoTrp2/IrEkcTgpMenBBSemzh05LvnJReelaFP+Tnwx9e1L1qcZ0pcn6NI/vCwlMSPzuaSdWauSx2RN0sPpZvr/aQLG51dfkf3WNaekHc756trJow25P1x3xpgtee9fP2HsrPyegiXjkgu8hfj4hYXv3fh33lqmjjnMHGHeuZkmprOfnxTLbnneUVz2IQhgFYTMAj74GL06i9JlICgJeCSpdAEENzMMd0rJFY2Omk0ej9DEoAtFdMiwx7OfYDuRe+0kcDNUBJTlaQxYBfj9w4c+/X1y7IfjB6SPZIYAMrq+GmBoC5ig7i+6foMOfd1512cLi6ft7vzRIkVJwnuD/Ux2CDNDFRqq8LLJQkzcRiyu0vJhotCko3/3Edoof1e7G7949603kxAiN8UEhcqCF86fOTBzs1ypwjM3yZua8cwdcrUSb/refWvHZC
ndCouLxYOb1w8dtCXa6n5kes263IZctHZ+fbkY76ZYlW7NXtVjR50FfBcaemzQ9XUKCfn0DOv5k+vyMvHE7bl6DvT1KdnQ1ibm5w2f9oDzp9U6G6pLK6lMM4sD37WNjOMXlwxL7bCmbf3+f/lYH7g1T0MHyWNlUhYC6dz2L3/OONZ7XFJwcP7YR82/+N52GchVzVVWtkEI5Ql/QXS5Mt0asg0kkivnW0PGRB9EmMnNUUHKSgRip66xRJv2Fz0UlMOjnIhRwShZ20SE3kJvl41dBiFZPTK2RPWULyE4fTXPVYysOq0aH/2dNl6yFUHZ3QZq4S27M5zocpbV/VEYXCaXMnVK/Nk3a/a4v2mqu/3SK/BOjbhx9cln7WxJOcuXqOeeTB9joYQ5RTlHbIgL0pmgDOzJCQ1Ly9nVf9BxXw7UDMGiz/6dOcTX78c9bnos3e+PZX7Kxt47qr1vNQTFZZIcQgvGEJRAaGPQzKKUHKFn1Gz2jArNDIZAyICUGErJFno8Jv9cIjOdITz2DLMqqnOxObey3LpfUlqI08U2Xpj3RAomMYazqxe6FQooATWykc0fbFjpylfHEZjbBVvY/NUJANBhTzZsO8eDgOshjpJX84VZYIRu+OOmrVM3fusX5KnRo0+xB5lnj4WAHokehGKj3LVD/xXseettxB7odKtFJ9sYBY+CXt35i9hjHO0betJ85f7xQ4cf948KuJlWmfb4zCS0HVQjn5BenbsMK6ssEdJsi7V4TTIlHQeY0CMZnlbAceeLnW9XWE9KgdqNmhL7kR4ikoPIJEggBR5glUPBSyKylCVM5J/DCufFy7TPQ97RH15aj+5+Pt27+9cdNellBt1ryAnYf08HqwovkfsyRiR2mL2r3WjD71/wbTq0V7w7iw7vPSsUVlwoQhPvSCYJai2EoM8WC00OUd8Kok8yUV2MqDyroQaqBhsNehnV1ta0tq48eIb30JCDvk9X/eU+G5eks87kbbeO+Xp5SUmlvHTErndA1WFhZ49dj2uEGJb1QIapHTwV0hkk5RiZzd3kJYI7FJN2Z5ARhcO2mGt208haLmPVD+EYk+zBmJL3G/q9gb6gCec5kSxPY1z3ndfzN11Znsa+z+RiRTOeDN5PbmGyrZgJ9QMqhjYILKyC0MYq3KOUHJGnzHKnsysao7baXIiYVBYQUQ1Bk8BxG7XudT0uv+aXU2WSi+drEH/8wQMxRKEjkXgkuWm7xqwygSN7q0UCqSw9TaKxS5A2DkssGKq4n5w2soHqj6VtBT0ubfYKvHNji2rWlF25fvnThUfGGhlienF++HFo3b6W9kjN8q0tu+LYMwPMzru5u42YDwNfmkHMyaBCfAB7DwXz+XRR3EZ6nu9jNfJqK1S7HtzcqPa9ZFCZoMNiY3pUAASYLgIVfDqZ8BwRp6Q1PST2auiign928l/xpo6lwk02la6Vwk/ThgMwbFTzZ3IkyTIKMxSgMf8gDJEMKkQHMHIoWKX1eoQ0Z++61qBtEdyScGI27boqbrmRCl3SSsjZmVI19KhMOGUmT1hcQtHiaAEnqOwv2lTdBsr93JLdV8NXhruDImO49FJYdJ/I297fYgeF2TUC5PPLfmF+PGP71M/5AdRVbayB6lDwYhE8tS/VHvjpzi9CG0uL+LlAu7ar344Xr/k5Iiu9nFcOC8ZhOR9YVL3gdN4ZJqdEnICmagN4YdHXAXe+keSxS2SZWZIWaykaNRn7HYarvx4U30vmC4sQyscpDiIVIAGrbOUjeBRLGVnKAUF6Jf9PwnToFxAE3sQHIWDcCPMrRLfS01+s136FDZW/jBPkcE+FU9rYV1JhTFoiY50NCfvPggmwA7+//Q0cdlNETgyaMQK4JJ942+UfDI/rI3R+tolOF4nodFOaPb0hA0FDFmQatvEcE8RUUM+JJGhMYTrTi88BHTQwmFaUz0wwzr9Tnn1bMAOiI3MjvP1wACGRxSbEhqBpUUgCqhpsUCZVssIlseQaJExRclMbot5iB2EexKqp7Kp1L7+es1HNlpQbSb0Fo7YDXQNxG9fNLiLGyp7k5Mqb43qnAvx9UWHe78lx7CpxBhNOmMnkJ+ARhSKvEF6cNS4Iw8emZ8QL2eoOPkwNxGadmqj2AsAlh0cM41bmx/ehhNzqNj2em621m2Bejtf+Vrhh7SBTHaZ/gtgDSK2xcLzZFmRUEZoKJHQXfM2M80dXFEjhydlZhngwJIb9MhvdXSXu/tRn8oHb73H8QnftkvPyWkAoxl9NUyaXro+X5HRfy0ax2Y6Sh8lNf8zkfoeSHojoiXxOIoiabScwbhkO06XsTL/AJZkE+4vr+yCpeJQi5+XzmOCPaX0zZb/2fPumUn5ZjZi+CtF6KGFWa92K4atn38JyIqfjkwI2RdDlYo4DIwXF8q/Odc7jXcmfzew/8NJ/u4WqFWkDT6ePiyeKVvQLNkUR2MOHLiha887JzhdSMnpeBeSYYkcOLpbDj4rzbtnsQ4OUxJ4dM3hvYOVgvK6h06kyfBerix2Ski5SJJ3Cotpz6jJunPhvtoxm1hu2Z4l5XfYHTr+2lBe2OVx3k5vylrioaVKdv4jPKi7tsophXCgWX+iSjIHX9ZWtLiYUzM6EjBXnDBcVMuGmvZoupYZCpC2K1a9jTF+13sN+D7XeeTWBG4NY6AfzijSfaHULSyBw+LRcc7PGgNDos708Wv7Rl68QcNLL+FLF6u7MhrsLkmAr+VwLt6htD/CbSwBZnEPmvMrFHNwn8/u7aDEvuG/P4EHsbzEXCz7+qd1l+nKdWGMd70RMA2xiNrIyaZcuGK+40Jj9U4KHyS/2Wm9no+i7ImGaqDoDhC7qJrDuQRQVKOTd9D5DG1NKUyVw3Sbs9F3xtiTQYyjhWMPhBxl3XPSqQOja2R1EY/o1ziMTSj+EAKLs6t0IPU/Rzu0t8EtdEkpwthXthUyIO2zs4wx631sPzpvqjQZFMy7Oct6yQGCfFvIsFZCQlpSb1X3zPsd7rLX9hYpra5OZhIPWgj3GbdgbHNVJ86A/ybpekhJaMUjBJDZ0FqwwbuOgYdNLCQV0wlvhZO/3XvgomJioWw5VAA4e/LygDi/1BPXV1fa6fgpu0vDYyZDrYlgD11w/hXvFMt9pbbmqwYj7VzETl4pAKrKQc+jIbRuD+hI93uebPXTJ7vDwtPTI8MnMWff2uKlzLA6HwWK16Us5bCmfixjlodDCkQZbpJ/efvECcV0tmDdbadB3e2w+FKSYGFfKKMQjL7Gmv+xT66yQSRzv0dbAl7ELgkaeii6znyLiuCEFiBvvXy546rW+Hm0vyN5TU92BuHGewSKeBC/2XNtjQTw1zINx4yhhkbgILFyi1IndL4QDYr4xBTCJBE8Lrhi1cc2Q6JkEHbK0LGtawZ4iastC0iwozPAp774Vqgt9o2tx0IiSrOytkAICiX9sPdSbwgm1uyy5UmtaoKvuA+XIW9MUyTSFosRqcXUhYUotuJApvkaN6eSaPoUyWclsHrzmVHWrWF5rATIhkaoIXLspgcDG78k2Gwk5lBTSe9uEPocQ5PyAGQUgF
aQOtKs2M1pLRHhFDCZiucFLJLbU/j9VEg7FAYiAYQyYQSMvgsio6H9rL1/Cp2zUVvWCE0c6BjvO6epa0NbLXbdebUltX75gYer2VVpK5vPyCxdxRm0osnAYsI0GXkwXZ+tapg9unVqxeWtTS3075dSjxrzCsrLC6sZaxWm4ILk44byM0ipUswoqwQpzOUgHfQXvquh5eOwHXnQ0WsCjmYR3Z8JbMtRXWRId1fQ6MT5fMWlbBrlVh9hih1lCQ87/A0Z9G28WjFDmtPV57CxbzDGRJGNH22qu1EamXTAximjLBKFdYnbQNIoWb+UpRAsmUeHETohmNwgCLRAiCcJggPBJReOg4RTSD49Dch/xnca2s0ged5odVoNBEpeqdGmEDMvQHcwSqJD3eFZSlzTPIIKYuilj9JsXGw2V1issPCIoTjKDj90+cv1AvY+y3pt7g18MTSEoB/ogi19RNfekessajIvj2QbvgzBM5t9jD4eCRhGJRZpE/IthTEHKHPdo7RY1ibFbWsYht0Vx8WvnXSeD0th+2GUGEx35IACJF0wVHQYhGZbc2Ib4xh8rFDM0mrgIWeANQm4KWjyqEQGiL6Zotz5KCEj0Cvruzv1aG0eBb30btb9xtMZn527iQJwPoACjfkKT9o3ZbEAzJB6bEKpPlM4QQMnCxKjwEfu6E7aKGbTT/NfzRWyg3MdAPBjHd8IvxOMwjAHFd9LwMaOntsvNBpwHSIlTlDxOBjygeIAAqYrvbEMUw7dmS8TrEKJlM7FMEoEnCUHDhhQ0YPMpUHIA+59Oct7MzUOTpyPYXddChDm3GekoLh35hTElEirl1At2Bst3eUxPxCzs9W2gQyJKLs+Wmwz4dlLpCZwUS1DSIUlM8BLFjA/OIxnket+uD1mwQeZ+Q+zgswISSb/0f+OxlghiSzrIXwS+pCpmiFBvhj5i4aHfITjxTnzCBpdShUGEm4IZHyQ26c6IlHrTxy8/eB7WIfFQ8fI//Ngf518hOKlF5tXtIzB/gpgk3/cNcpoykbgLKPzaY6mV8xB+CcFNcmpDpLuOZ62Pk3Eb4SExpHAsMSQ+jH82RR6QdEwPCHu1r1AZmrG2EbubreJuchf7upUf/icZniNgWYKv1JMonaFk+hZDarWuAMYIYBuDmQF7pD+VzzevnrVeeXjeZZeHLEaDgVHNMWzsKpSdaArU04W4Esdu32qK42Sv3WQQakPiuNKioAGlHCooJrdygt7vBImwG2L1tuDCWV7RyX7AHTz9y7MS4fHUS5SjRTQoKkezgN5PaKYrsJZP8Fo+JwJq/zQxC7t8+4XRENJHNU5qlRdLdAfio4mtvo2teET0yMcjhfU65f4kh3nTYk3R+VIAe0UD5yvh4sG1OXi5cM6MKPgvhmD5iMrN0qNg+Vk9YPiHoZ73aXLmRyqe1JDqpSTbRwN6kX9YFIWIdWnwFr1KWKuPpTlRflpYnvJyexBzygc/gLKVctqXOAtq2GHtivUOu/iusCb5CQHLJfMEIAl7HM3mo3gbytc8wJIEoruCnPlkACfLnhrzmrW46WHPvvQXx4t0ipzbJ0H2sIou+LigIP+obgBuYWhnyl8BjK7UbF2sEpxUmaOcd6fcBDYMrWKCXWXhdibBDOg+ML1LOjf8gC4dsT1uDIOyqIzZbDwtgLhPq9pWfBzb/HVZoqbleyp8gNhwJOGimnPGPm3akRd2NEq7SxXf54riw4NlUwK0SfvCEru9JYhRsoEjkAVw1Asa0ZtH1BJHq+Kp76GphslniE/qpdCT6RUuA2gKe2nseLsl1sygB9PMgY//4UhXUIdvOkohGm5GEVpz8LSuXE9xUuRcpxYW4dwkz+vXNPVsm3vgRvLr0hWT1XL3yNjjCp5zQGK2w/QvjVOFVdmHa302XmQSvGLwxlKUpMGbVaxKJy8u5eIoq+HpkpkOI4S85ibQfcmW3/049n8NefxE9k+PXe9s6zT6q2HGM2pm0PqInAx7rSfb9/pBRxTBf2wgMPercuvPfrti+OytCZlbMAEpZtC3st6nguAHUwFjHXEjrfNRpPZ9SLRwrYBS0gNcx1GsECSU1MwzBEQq+yNnixYS6THF6tcwOqZWP4cxttfjoBCU+F3Okv0s/TloBdAypvV0LGAaTl7/7WW5Z32cv7dTwWIbzxLVxmtndzL9KYmC8Uz/cpl+YZV7dpnWU9tKZXkDQ4GafmKk0gNu5W9zu1B0zuPHR4CxTIRm9v0p0sRU2VMlTi7hM8CwcOvH97mCpY3PsysvTB4osrSeFpHZVWSyiEzYU7uOd19izZciL3FRNEke+NFwdj1iPiUSkpjerAOjVikArgH0wc9Ox6MGiXCEnAYqu7DTJCJzKEeMTEo4uuvN5KgphxDGhV7cL0ojAQvOGeiiZqK+yoK4xb6pi+l1orhHFyOG+yTBf2wkMPepcupF3+0Zg4/DONGodkSFfEKy9CodXW9Venjj2QjgaYQlasqZyYobyXA4avgfxCF45ufJf3Hsur2U6MqNyqIbCbwdJcD2lS0mMrPWuDv2S8q9dp/D9JVa02yVDuLKoI27+gWR90ISsj1rjkz+BxRQHDf8F8JjD1Rdtwc1yW1dQg7hkwRebnVn575wP4+F5w/w8Y6phLHDtj04Vxe1sMW3v4xlElSqrA60Yw2BekCVmwP/W5h2xda/1cUZZMuwYVu3LXj6HN1gz2zCLrfxGzf5cM/EKx85GEvhEUXu3Wmwhqy+ETZd9wB/IeQO2/LgXGN/Zgb1sgUsH8Z+sGo3cYJI2zQJ8qdUFDarf5bGKNi8Xt6F+DDSLvtr8t+cDIVddB9SurRd9d6S/Cb4YX659pMff/ja6aWYOmB+L0cbPOQZhB2KV8Xen5y48W6o34ztHpjkh591GE0zHYkrD34umXuaEbjaCrYPs97gAJ6CHVtofD2F+YoPQLiC2+fPQEVXuOj5W888dpwUW1Y0RSfjFHjVDsdZKlGUKi9W7nvHf4yoMJKmklbxFFpRwWg4Z6qYwQcxVhZHgR+4rndwIAKDXwzTQgWYJeAtL3G+yECgn6IMw1oB4zg7yFgUCFT7CbzkYnLmKCJgiT66ixGxVpbxiGCGqLZusnCg1xanQlvYwHp8O7npYs72cNfHgv3mK40+6XbtVMo06VVp6PRZR4w9nbyR4k9bLYUTHZBcWFs20nwgn1tRTcZcCmqeSz6YYjmuhyzydrOcgaP7lwlvL+TOMYTcqXeqn5gAhbySIctWLBOjz5EnT0z09Ue07hBFTaJmsjdzzGpx8lOjy4t7/LdTloHplEJKH13Ntcwp/YA8PeL5keXAZER3/iu+jbbnaLOJLB5JjyiML3VV2WLCgIGkCuStLkEfCM3SoNiBtgRtAkAcdAfAPHqt/JINXejbLmG5gzHNDJLZlXHsx82cVMdmYF6vZgYMqrUFlGeewzobf4AKzCZ0sGTcDU38GzXCwv9rB0XPammMKbOmPHVxPMRqQ4wiCuJBFrOgawfaIQmE3S6StsrCIc8WlBjZ7C3TRZQy58sou0rL4Q71/2Ix+Ru/NA+mFpMrzmWzEspi8O+Z
uIORxvIYSSGdSRsQP+42QRE4Buo9DHQFTZZ/ITaoe8RKU4DZa/drM0ITlHgEKcymsX46eoMt6lwx2dfxD3pAENKlYXbCAqwKfW65R5OJTGQWm+FgD4aJZDCnRG46JMNXFPH1Ad9toTKmi6cgcsTjkPRynr306GtNUoRc5eG5Z79CPhG/bSY84NHaDb4jbqTK/co1LGWfV/D57xzRn6r8us6E0miBoUz671q4RfINRX89DqHM2LLDVo6xV9VPRuay8A5/924oLmtN+5npQ8zxCdjtOZOl4QrB2tIu/ZklFqsFlu+GYCgO+MaSrYNOmLFIViarpoD0rtz4LIYBBTsGwLLlvH4TwmLh7VXi/v7cT32gsXzVpI3q+fPqpYDmM/YswrnvQg8athFuzm6jXAd6KzWjQotfA+FxfVNos0TrmxmI/6d3i+jTfM3xpoM4U4HW36XlWjkgZpwGmB6DxW4racYRda2jVVduQ27RmuPMFppQtUD7RFnfOtGkOnJA53DbVdbCdp5px9KkJ3EZmTdWjZdT1XZLcokINxKY916vsMS6SG/Uzt0Wo2frIsm09dSCaUAVgb0d0Id20OKUgmBgpy0BDaY7awcrC29Yr0Nfeclvb08pYVwyeb5HCNmfbSqnwuguNUNDA4L+sirbbLzOoprDP94rdiOfivCQFGkGcAkrb3eFSK79QG7owGF/HUw4gdWzulGEE60kPgATYOmQE97Rt/bf7o/w+3fFbVAvUdJBnT5xhWQ1WStur7kiHC2Wih7iUUQnFMsErIJsPor4Eo9vMXOXo25eERZkPE6NGiUPTxcRyvlbj5LGzSWdwu7/p3yor7Y0PSoXQ0ttxPGJkE7mBiuw9FHDhNJ5MHdCGIAbBJktD0EYbHZWtyb8i+FlM2tn68f44FxJy4LW3EDfEfrwAcgI8w8khYSNOrfN5byT3Bg2b38HsFrOsyGKwuRI1oZVCPnLXkndeRvpCb0sNQzieWFB+0Ipz75wKHl2af8VWhO1Sn08gI6SssxhCLWnSCKpxsIvbK06j4STfRrz3kSg2xoiM++N6fP6kQ3O0gPllMh5rEk9L4Tg06OvVGiLiiBEPHIk+dtZlF578YNz+ozPpFTzsOGL/uBeM9YRWXUwE/SSsq+8LsinRPw5FJ8IUTdVyffKUhU+hstOAgERrztdPEtR4udQ5wtAdvIPhnTjxn20JKjQkm6/H31ZtLflQXxazffrdEVJsf4OiUzAehnKBQm1hbmjG3R9a3DvUhWGtvR2GEtvtyZ8jxU+jMAo7xYiHFwLvT/Slk6vK5XxKfFzSzjZYOUqeVM1WtDPOuNz7eVWULXvylc/o10n2p3tLspmmfnypcbLDe8jycPi7QSUREAu8teQ92byWAf5evg++ituqsLI3H5ubWXrJEfJoVnF57q2QKJ/tkKBhERw8/2FL6cSKtSUs9MPFybDDj62xiB0xqMwa5VWnJGLsCL1/ykKPSaUHg5oYGLuFB+hSj388ebP95OjbhU96rkeHW52r6J1zuykN678yIHNMXlqJ4Gbd5TM7wB0wVM392roaAUby7mB0ZoK4nJ76ELAkbTo4pnbjWF/70j++HWdsBtSaOI6oVgvEHsvVEfssfH/urlpikYvgEA8BFlKAJ0kyC7FfiLwo1F8sTOAHKqvPAwXsTzbet4O4PmjdiyvbVOu+wrOK88dJ1ex7U6ugg3m+h66MGcFzc6vnbgbBuoP6NOuC+c6Yck5lD2uNeV1fMZvI+bNb1RWQKjvy6ha99Fz7lsR6EDoxVrBOmY4tBlALaUbkUwkcMgdeSjl1CU46sxIHhB3pb4tVzBqufGJcsQR6vgoloxMzoIfWGB0DG9wiWmubn+qHb6xiJmIITv63kf0pMAZAr8GRltdAnqcpFvUNbmiCcR9pDpw6Qa7SNKFe6oqeGs82UKiED+AmQ6J3P6SjH5xNwrnge7dKUXpQ85bU85DoMbf7o/j6QRj1on8JgGeEpOvecdgjw2e/tZhmT3812FX49t9oUfBPQru9HTZfgYgXux6tHmTeAqxAj5/DiVxwoXrHdSxUm0iVo8GD2XNOEZ002M23obgSu5fGotc0owi0qwlhGOwUKAWzaaUl9Cavu3J5rGYo+62pEYZqbNgcdM5D87VtRXXroAwUNqPO36wEM+30K3cj6xuxKdN/NP1uBJI8cppkzWtV5euTTGGA7Kb0YNCXF5IsUcn/xt2zgrRTQ+Lb1jvQlSltDMyBEG2jkObwAa5rKnPhSOki2DM4P0X43PIaTD6apWrqm/BHYFiDkPwr57oXP2E9KP5N/USnjG3HgcL642QYGFbvUIV8adC17kztnffysnJ1dzpziCowwk2bybGsdI3I7UKv9usa4P2FQ0T1Q6yKULgetzwD3VOp4w3pf/3QxH5+ZVBdraEP+uz4QLk1wAHRtEv/Ijrblm5aFixbOCwLYUzmV15nQuUXgYTlID9sXzsP/m3w7QbhtbMjO+hDdzrnWQfHKpxOY/uFH3inESuyuBE9q6A4CMTDKgJwqLO/ZZ9hA9pXx2MXLFEHYiY1T2wW5yCv8vJTTKEA7KeaUf27OwzDJm7RcTRw+KbNrgQqZV2cgw3CjiGVg7GKxp7XLKGpA1Ea2u0nwxXW9eVLR3SBN462tXLn8LPu/Po+PgdPCNVaGvUJTjE47cbzkUb/BrCe4CWhL8fI2Joi7QXo5igJqhbMkwbbpBLbwxnboSOBCRsctRwUn0fRTUNMeT7m4KWF2kq01d7mND9MjZ+spKyxmxsILMuP3uxqTdhOJmHaWgxk+vWHTqF/nEp/qUqqOtNEgrrHMp+JGY8hNZ64wMZRneGLNpWFm5ts95Kbs63Q6Oz+lWeFSrIZ0J0binjeLJzmNLMzo71sXQcud/TEzlzZVSUKPMln5LfaXNiTuFaGDu3TRIypyhTzJTJ5Ady6p5dGChS09sH0kC9Erm74pqVEAZudl7Mlpx5Hp1v48ndFxV2iltqmNdFVZFy2mztiMeGKwOjs/G/ZTpiqBScm+DDtdig+a8u3zj6arlPBrXLIXNPeO83HaimDqqsla3Ewfv9u58/xRb5F9mOGJPQNc/CvBirxMehAcpDJ+uvB84VO5QxYlL3Qjl0DgYTX3/BieydAcMOT84YiNz8WUf5rDYTA4tWlxSZN1LVSKwVJuVAqHGASrsxq55up77JN1wfTQgbVAbv1AVPfSS50T0o5PBqypeP/uaIGXS/BdhspLaNPhYo4ydEI9ek8i32Hq7hMFbuDGYL4aHq/95IF/QkQbB47rqpisxrA9/yvfu7ECMaN5tT7zdVa5X/9yV+FkkXJcN0NimU8yobeHGF5ZNPmCpgxk29C+NnjtbGMF8GU6eNrETC39vUo0wbngldjY3cmW8MSgi5qlAhakXHSaeLXcPlUJhrQ5bakGMBP/nFiYjrSM24qXu77arKgGS4DJimdkw48kanS27HqGAmXIK79ht/PjR2gGR9v7IZyx+HnNEzncNUJ27JiX3WmEuHRgXQtbv5A6e/7spE8io2cREmK0WXm5GyJSEiC7rEWFAVJsklMfv
g+sofKroKC6pSRtZgUxyswTwSOUiNz7FL/njMpUQHYF/oJkxzlewLSFhdMQ24n+Vjj2/V+TjVazeF3qK/Re75QF7fgTYvVtUoay2M5Tdm35w5ygawvHz+9+d1OvOPlfVXWCmWr25WAheGcvKfrHSoUqIytf3bgotme1Q4bA/mYLc8BnVc9izvitoYjRKuq5vvzfx8O/nQ4TMVVlTcKw6tEDeIwbiBq7QMtG6XolPVRqGCZQEb/ZC+i/nkh/RlyuEVsSA2VPfVbRhF5FvjTRBLbda6/PZnxr5g6opWXDWS3VSddiUi10WhO487TqAXm2LXsgQvzXPy7xWwnp2AdJyFguGaFhuPMgkKQPVwwZQVt1m0XU28bZrJJ+GnRTt+Kf+pLHbGeVAqSq6ac9bZlsSTPYnKzBE3KO5Vv39kT8ExG8/QxLUDog/O3OAEj8CGnfnuWqCSYXmiHpy9xpfpuCYLtGbsa7bxF23kuC++242fv/3d+6W/SVN9sLhM3L9qcZ6LLRnPGdrTs/RCqG9R1z+SXcnBteH0sFfjBh6e54QFYwF3m68d5sOksxGgPWaHX56tBkNr+YVXUa/oJblEUJT/cfwcyzH0KZEFKBKXvea8nSIf6Gg+T8cFtMwKFvV3c8e1d29syrrfjxufzJLNjRnFk9r6eNOE6+etZfvztFlESa3ZRhdgm16XK5rtizss2B03WXjn4Qkjgrf5O2uiiXokAl1ng85NjDYyu8r2ekFj7Vq+5KnTuFWFScM4XXKiNxx0cYOD0T5cgoga77bdU5PJtb+n2VQzm4rG7XcM7x6BAnzHfYDvuWefaqpbmnHnO1inhpTau/b7zfoWSRoSWI8pjq+rlcN02XZHidBZD8sFy5hMrfPNedKuDP2rgjJ9pzcgi4Yn+ya9FZQTVAq6lvn4YO6ZE9YDQS6hTD2SuRc8hHyW0y40DZiHN/Yb8PoPwWtFqQwNLCnKl6VKZbindqCBRqX4E3RoH3vGPQPbojEDRUhNGR0ZYYKi1LtMgk1S9viG73cEboHzZm3uBXWF8whxE2X9/g9zB8U6b2SBn0MQtw5+vukcEX9M/fsbyXVFiP1jjomssJOC7B+l/ptY6YyE8oGyrLokZ4U+UBgUS/KkSVBkoR/24f350t3e8DTVs2+UtTXN2eGPvhwfp0EWwfiLhx89+GTvkT75Ln9vkNcQvYeIRg6v/bB5kIxvJq4zufn++kcH6ON4gcOMQNbOv08G9wEYwya89e4Nm7zpYRegmWp6SOtqradsDpHyRMb7oZdy09PUmVsTKZFljwfO12fUqiXzOcWhEvkXtIQzGgWmdrBAwhfrRH2idKYAirx5lSCPOQCS3LhVgG5uozK0GvKYGUUpY3VA5ov1l7BB+YHrn2lWYEIM/z0OHinc0wpRk3Ccj35FubB1Bqgk68Z+7U2sNG/BaktxaFApylggnh3sSbGXiuMJ1+2U7VNi2o0fO+zLWArvs+DTbwPhVJJWquo97pnfvPw4M6z/BXt3xeIzg4GdayBxnla3IYoZDJwFdcDs48gIDvJUXX9tQNRcYUFpaadVBOPCMG2hS+oFNQ2VrS7XSYMAyWnJ0pvPeXCu9q8hwyUp87Kk1DIfd/qD+WU4RxRzbPJ/YFeU+7FtBPLpEwocNfZrjVPOmxe6QV2+NtUYBtO7CXXRCW3IF23ouVLx0yNLM++B1U9pU79LWZqtvDlJvMiUQ+3bR8Hzne2t+ekQILi2daLpDemc+3wbss5/9BYREsk7MBgUnHBu+Z8HyEXAFta7fDwjaLmiQB4W5oeoCb6CfBUzRE1khkVK/TwaNCEFfcQf68LUNiu1IlN1AXMS2iiP/6S4CUMGtfI8xniOLtZghjgk6Zm+BA336Fy2KXO5BpFBlMHaPvAn0KfeVPiOhg6iOuZSnBshYqKHBu1Zno6Q3K8SJAJdnI3xM/tbpg9tnVSoHM15T2cacplZzWe0dvXI5SVJ5+q2PTG6rI8D2OUHU424I8a01MEWyY1yva/rj94/VclTQTsZwfmqGKJ9G9G6kEbEhF1lR0WcN1uWDSa/aPrtlxzkc8paMzNLkAtGxrs/dmEOaQ1LS1DRwRVID2pYKW+OV3DMyhm2HEVV4aqRZAnZvWiG4xCNzWK6BwdMI8azxqp2uPPuF2+/1h6lWz5nbvzqxaGUNc9cPnMGD95RtM0Vg3PFMNkG8da6+wct9/qbxrKMLsmpoDWOoshA2MEVw1ZLR3gAE3qIOOjKqlMnYiaK3/f/WQRO9MdmhZWoBop8tUa0Y4sheXszbxsO8XIb2a5ZiZMMavtmu3IRcx+an0z00ll7yo0WVYozYIVPhthW/JS3J4vy2OfHCiWz+bsMw71X0N6MhGhqk7so5rwXL10WW+KlFVNTxPmUk/WDGUcz9zgPWJKgF766ZYR3ijtC+el77Mtdoo4LTiu+KtnSs4djt0pG2a2gDby8NvDbDusE/7SWb98JfgtS7mj4pu4cJSngkO9FN9AzgvZtoj0r+ffJe/SdlRhMJqEITJFs7jutA1BrIHd0o8uYvJ66o3rcCPCXBS1Ahy3zzWCU2Qpggufd33BHN5Z4oXemkxGNgTxverakgMvqVmp1LssI4mePBpjpw13iroZZ2fMQCkASLtyCmncCZxEdH5FtlqSCYfjTPFt2rSxtvuMatit4VoS82CbgfUf3ooGPyU0nP1zT9AD8s9C3ulVGcHhfRhlRiFcAylELOUxl0turGm0tePza7O/ubyfsGUPZvlsYlfBCqB4UYkAqHoqOBPMC8Z0714OAtisSxRd8ftD02885UAbIkDYpuDYgZ3R2KCkcDMV5k0WQS/HAjnFYTBWSiI+xTussWtHsMdTfCbYCcap1f5A/5ozo0LV1mI1nagi/JwS85DxigzHSYxxI3iXCps/djRodPh34u1SziQBhbv0/fPndH6+XTUi5EyS8OMygim0RAo3DHFSIEymOdict6WBTOD6q0qWt3XNquxzm4Z2Z9ad1MEVRONwWKTx4p/XpZBRMt6h0adJCtsnObpPgYzMEDyZRRBHVnDr8IvWhqmpLZ584lSMroLrjAV6qJNPXiXjeLAowhuNtQP6zWZNwqGXgrNejYrExu/lnTugoUBt8Wmugq8Kc3mdMYwpB+chT88cIb5nkwkaQikLubHHujTney0mRfkEtbkRU/wq6R7yfDYhOjxKUYmOP7bGP2T3YMgRFfdzsz5+3dYfFKNps0ZbmLRAvCJufRlGvgNucZllxwYfMTpjNMjtlsncnllT9hmYCFWtWylBqzpbBCl8M/KhLhhe/RE7vIXktIaE5qO105QahVnJlotbcKLESWN+brqdV5iZ03HuopO/Cy/YnM4P6yMddP3gCg1mEkdVwI54kP5Un6lJ7Jj8xK6Y44bTuXHLHTYLECipLrs/O7bFcjZyRwT+JlTnPGprWUXZtecjy8PAXL0HkNS7VZluZIT0FpO97HuBiudfy2Ig0DwrHMnJyIWdlPxAWTvq2IGgvgmGDWnABBkpi1z16rMcS2UxXmtjLMkCTpRI4YolXWZJFKoYIpJH5yV9aK5
3NeRzrG9Abw/oOS8wYRqPLJNKFE4S9YAJIzU498rBG3f6LOVBY0lVYrV4yhxRmU9iEoasnYuZp0yVvxRdKmM4QphRdK3VA6wku8zW4LFhc6182RdFWrCaI59Po4Peu5oFTrhetpsFonufrJVQNQvlIiS3Z5wnAwS2hgnJo1TPgxnhwoK0mxS5XdlzlO4pOrhS7559/GLSHij4MoMzXA/W75wYjxNQPKTQnquErf2r2KGhVn+bm4XlzEun3bVI1K9Tdzz73f7B2W4aE6TpABiRAAoiTF4WI0v1CsE6QhfK7RQArQeSAh4j4J33R77906lHVzLPRZgs23/OzngrmgOUqEciDUZ9nZjB9WeiFPxkDtJsH2pK9JWeuXw+JjxADw3a727cEgN7OU1chdTk526OuLxRbxLykWZkEWJRvWYT6vuInC4aBlQz6eU256aAug9lznAQyyGM+CymjgTf5yLh8kVvc5T4PecurvijcSRfd7eYe7rka21dbh+vrRGd6pQN+vaIijGx8ppU4pNsQTpwgkUzyWUABDTzgNWPyhG50p3s95E2v+rhgIbaz7nZjD/RsFTXX2sGOdqJTvWZD18hKR/qXJlE/MiMNkrc5b5vqSJYUQsdgxWny93/hV/LfGXrIgpIEHeb3e66cb0b6/Xr2Y8yQHhlQUyLTAqZibbchQbIB0SqhuFJQMqTH0C0HCspvFwTwDHSdZ8Fz4HnwAnTOybyI7iVUizseDi6eIsVKsAwoV6FSlWo1VjCoVUfmtDNwwFnnnHfBRZestIqKmkaLVm0go3YdOml10dMxMetmaQm09OjVp5+NlZ2DMywug4YMG+F2xWiLocZjjJfPuAmTpnqORablyVegMDov5oZiJXFYlttuicI6T8IxZpQ31KpTr0FjeAZB3nNYsxYxUSdbEw7IT/lBYM4tAQQShIxNbGYLCMhKVrGG1axlHUeSIhUZkAAESHcf95tB9fzDoQdvIQHw7L1dwj+gb2KG47UO3o0FFuYBqqsP9OUtNB+xGSaNNeSI2k/US2LDJFKCY5a1VN+j806gaxRuxP4LxicsTL52p402WC+dUiY+6IhCBQ5YLUmXNrqwzGYaNnMIetssbURG8VlmbT1t5P7Ka70esFw2lg3ojqnb+EGVe2kIYc9cJtPKC+rnB7meOx7CkGznDc/Yz8q+waTLLMMuAoMUOiZfBh/JbRauIHucv2O8bEOj7xhzO3upzzmAvKIXXB2kDabYdK62zNHyl6uyef8KIs9DO3c9RuyT8T1uVLRPvYVwJYatIGDHgyEpjuBpz7SPJPuQ4Y55+BI9/2tlgLjkiXp7GroQ9vdM5nJNbzqowRV2Kd9Pc7Z0oVEPHHqmc7fDZviYHmbzn/Pn6VEAVAtDQ/l7HES7ugBgROsatFf9pUAEgS2NUFBKIzQAyr9hdUcMY7kpVs9/LqbyUY1IpDjJL3mNyCDzvIGBUBLkUjKhnihgvPBfKMelBOoJbG2FildTp42IxLoeyzti5Mmpf9+siE+bGBeaKzA2lc/ONfDrnxZCAQA=) format('woff2'); +} +@font-face{ + font-family: "dm"; + font-weight: bold; + font-style: normal; + unicode-range: U+1B00-218F; + src: local('☺'), + url(data:font/woff2;charset=utf-8;base64,d09GMk9UVE8AAAsQAAsAAAAAEVAAAArGAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAADZp/GyAcKgZgAIEsATYCJANCBAYFgXQHIBuKEACO0xV3IEmSe0c05/9ekrsEyk8lPUoDXK5mnpohngqaBNGKEVwimNYU07a0Dv5E6RObSze/5SH/9u5+lMBf4FlUW4YJhFESSNAWUeABBtb1+7W6D/coHi000caFQieU++/MG3g+tYRpKCbNZPo/krili6RKyB4akRoZMjmgIZ2t37uCGbUBgiAREYR4s6Pn2T1enl5znmmFY0+5mZuCuKkWiJvG46bzubECPNqCz2jRpX7RV40mYMNCGIkLbREy2z3agpeE/JGt88H+N42B3tFIHARlLrGZOH3u4uVrdt5I1e477+ru+My6/v4uCtePCRYeFfr+js4qpetRX3P/L5Hg6XrU7ZyC9W1SGdedChfMnU+qAJCdmu58asZDffgbv+G77vrKzzmf/ZkEjL/HgMgu9Pv3czBkLqsdWFaMNkJEXEPIz75hYAOIQAUK8Ad1mHcgJQT+HmcDOSM2hNIRI5+4IURsIgLPQUqXPfWZWE5k8kieNy+d18X7i7+O3y8wEywS7Ba8I8eR9VSJ8Jpoueiu2SazAnMr82DzTyPUI+6INX1Giz7i/u8c9Tv/LmdN4znNc+qOMxPnQNjPlGLG73AcfN9/++n39/OxFz4+e5qbLHV+Vlj/tOVnmC2FdGwBk/BqKjbH8smlM2JI7Cd+4rz4XD3k0FqNPqmQMblFUGqN8qIXw7lRYkjrIoyj4QfaZFzOGSmxsdx/DGSADMtgoUQrqYEM7h3tonO/U12lv627rAtJZ9Ka9e/i0vfkWKnUB9iwnaLrVyIvXZKKjbrS/pdhIUdmzv4yl0BLtG0BWm3Dv2kd42MSohKSpNWqKkdXN6WTKkGVEcJIasL3KTdcCRWJ8UIo2yUMgP3YBmzOIwBb0g1epERryHzxTWF7fFJsSnS66FFoyRF7Xz/7wNiQxAhtqNrvsfVSmKjfWhvhI14IKdRVB37m3tKuwq9Sd8dNG+qW2uIXn747Qq0+OCEULssvSvEqy6xQIY5irIMutWYGQBtL4CNuBKYImNGhF+KCddp6t8TEKHWVg6u7R3qo494xwKssJdrsQ4rxYrjbnBeekHdu/X8UlAnq+nOxWEOuxqjFDwfGQPX3kipuNPeKlsjBaQp2wrKZIDP5/Mn5wJRh4aJtahmAvQPESMIHlvOhQYbXTDArHHv8CR7SX/EUWLOjehnVWFJi8ITjBKgWxLfsoneWKzray8vV3IryXbs8FLsYcQjYEu/A5olkP1HkY70hvZa2M/jdv2cwiGI/g/0BrsHgvZZ2+nj3+/oIpcHO3s+DM/N6cM/D3m52cVgD7HhGFL6GaV16z975XKVxC23ywcNvTnG+8Ik07fz/ZHqwvfOboT3Nixfv3SeP07pzSBbdRZ8KMDQ3ZusePMpRyvcH+J2WRXK7M+dvWrxonzxY39zkJw9QGkC96QM9VK3jIsptOwevUtzOORCwG6L53Bawojd3e95U3dfdK7r5qseu02rrnjXLDiyvX9q99OVe0Yr8fwTIa6dftnQP1g8e6F8zsLXVbofVq1VFLjp7lZ2ny+bVIvEAt5zI44DP9XNRtKkRf3myn2vkhKQ4pAf2dP3eBeoe4s0v/D6IpPGSkp+mAWMNlmD23Z+5MkxkkPKI7YfXWa861ASEUgbnqYQ+EodTkZmazCxpqiY1KoWBgATot1d/CmhtY3evdUfN8Z3Zsn9LyGHPyd2YsMbSKdgc7zwnO0pdXk4+oSLDNG
Fh0hBNSFQI40BdXkGK4SdunKaGAGnbv21g18Y3GLX0jv9Mo95xo37w/S2gTtiOZWQL1Qy2JK55zx1YLfRc7b7Z77TPA7dX57pF4pAa0D15AaDxHwPsc7Budm4E8ya3Fkl3BOcXQvc33a675V123CCT9KDyM6esUvrsZNWWTXaKQ27aM5VqRjIcccLPS+EiPdzo1M2YbMGB5so54mi1qRh/IRO57fTd+NLEnFQRzn0OsEsY4HtGfeSCiNtjek0/DSr28ZV6nws+ftw7syCvMP9xFiPpQfJfSOflZ1eUGYL8lAFKtyDGOVulz5X2V+N13T/PfqvT6mtdu15rdU6ftbl/rsmK2c8Jt3Nyeuh5VetNv1KPLJnEpC3Lyi4uk7Zsrd5y+kTguUCm2pU0ZOQnFlrfqrjkfsZLdVB2/mgxJcFsU4naWeF14dRpdUqJt8z+LOlQXK1qt16BL+8FCg7hGTS2GvDOLCP5XJMNu6CS1zeAwL8Y5Hhh3YOGtULzokDkBPLcwSEbxJ1LJ7ee7u6qfdctr9/irb4UESpL7CBhcQNWUhKj3B/LyTaq2JVkpualF1nfz/M9dPys6oTCKyXfS+bkSCru3Q28Z43H4sP0IyeUg5PzTFsORvNoWEhLwAGbkwzj5TMAK3FE45iPb+EALJkRfumh+8radc5I/lN8o3q54HqjJQ2nqGD1pYtB/vu37N0YOUn0looabuqq78zR5+oz9aIaXCWMNFXRjxy5co53tJrEa0zZB0nb/lP2VjfIjRzkwzTL76gakOo7sjpzFhatrJwt+o56BvInIAdX6UwKz1VjqQcex5z9Sfku+DfRLCpwus96j7miszBOBVKYK51FYdcHWO5MwkC5cTx9VozPmvvJ8HFLjvmFy8QazqSB5WPuD3KTBiVVMM7ye6p6+D6MMPyum3YPj6icKfqeknjXwIG7nAScOzPfgmDc7I5nMEdghup96Fu9h5D1fusO42kiBUwPhWZYIJ1BSaqw8x2TZIQDnQV8nn/ZkMqxL7d3/NffGsfREA+TjwzhcLyKxNzue+PPrX45YP7VxsIwPLmi7fsD66D2cyB/5H5Y/5NLP3T+QMD+T3w4AK30xW3KjYdXihZzJ/rw38LHH/Qfkt6IYDsV30/iHVS0JjpamqZJi0xjYHO8314qRjNGGhmviY3z3UOqJilB+mP5neYHT73tSpiWJrLdY8fNpdZ41JbVeEKgJiAqUDaVurKcBD4VGhUSLJ0QgeGvdTqe8ZPjMJQNjwGzYUka2MIyGi8bFgnhqlBy7EcctvwpVNtK+4SroRSGh017Etdm0u2fJxSHZBh7M9dlYnUmhZ3ihVFr4fo1cCRThD01qRqzLvP6EV1x5EZvGSdZWNSnxibGxSflWfzPOH2sLULkQSMCQuZsMIh56+5DSETwceyOiJEnaqrpeFMTL45KT2SVE6FCQT6mYkT6UxQL3kuE0ImJiQstjCOI5BABsSRIWyMm2PELv4BMxL8Js/UIJUYg3sC34LFtcngG8TljfTzC+gukFTFGxR5MXK+wY1kpy7AT2ansfHYlu5nNY8smTJwwdYJdby2ZlZWwVqwNS3AetMLZ4HDyuQ78PjA88OvALwM/DvQPvBvoRGjArf/P/p7+B4jA4FmsWNuyYU1qfEqEDIMgdpuy/DIT7Ueu/lfEF1QhhFBPUGOzf+ULT2q/fv3yySxG8AER+Hj7twX/+QIoqpXncPtqIkwNZH1EJiG5WU6zCgr9gb5DFXQaRa1vCO1i7b5v0XiWrEw12SRfxTKzzHEO2gfJIRZCQifwEY0ZQigICscQBIJieHjCYvgQimgqyFdBkqbxqDKH38SLt0C+FNx5UGIVYy003wLLO0kIu1KWk0CrK+dB4ZwguLYJOTtLGS9V8TTXwgaco2qtPL8YjqsfuXLUpsjFXIrCJUfEE6mJFbx4Yi3gzRcBfKgAX7PQQggA) format('woff2'); +} +@font-face{ + font-family: "dm"; + font-weight: bold; + font-style: normal; + unicode-range: U+2190-21FF; + src: local('☺'), + 
url(data:font/woff2;charset=utf-8;base64,d09GMk9UVE8AAAUwAAsAAAAAB1AAAATmAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAADYgiGyAcKgZgADQBNgIkAxgEBgWBdAcgG4sGAI7TpTZgEnN0/bdZ/WbmEV2iJA6zYsqmkhMnIk5BTAcihkUMkpy4sKwZOZW/86+rntyLICrXkQ0QR08+dWrm0gQ092BRC9DqXtSrFmhMLHCpwL2epzzWACpGhKqRXuEpp/dUeAUCAgeEEGVSsaH2AaPBuPtMK1N7Z4K8GeQtgSBv5eRtvBwusJBAXv0Y9B08uGa1Cq8P9LGwZ+MB/BzxAMI3FN8NuvJ/Wyh+GAJKQvxDYjftPnI66b6Uwkdeeru4ocHYGjOaaq4brORYs5kiqzHz43HWWGZoNlQf2H/8SMsd5Y5Fjq9N5ffv2QcdWSrnbSvN237wz3wHOHNnXr9OX1r74QSeH2ZgOI82/g8dsxCCYs+IPs2/AICQIT8I4IAAB1odPoJ/yf3kNfI7t4Mr5p5UvokvJnrtWBYaZvbgd6pOqUvqipGme1xmdViRfcQxYrfgMKuIct9ip1BD5VM/LmsVqEUx6zZVWvExKy6G4kfesCLcgU4ViqilYY8tJ/4on/VBDTt1y82q0EpdZpOrO7ZL6uqUNKyMWanyTVnL6ry441JomBkvyeUqPMNEuqw9Lp9SsJNM474Lh7CChhXZbQ673e7rmpmedsbMdTk7nWpmwTKqHG+Rf2sl+M6PPL69oOp0OjvnYl2zrrkZDVah5S4307BTVE44vpzkw0TU3plFld9ji85rxnvIW/ilSjJJZilGmu6d7VE7hh0jjhF8gJ2Jct9k3L8UryoYt9V9Nz6AZxx2dKEjenZ6esoZ4zTNdM+qWTomZicht5Wy6wrk/s1epEpWu7RgRRPBj7z8P7JbtZiFgRhFV+72YYEYnZSJmayKSrMzkjPWOTU9O62xOUYddgemYHHUXW4WwKLp0t0+GMCibrlZChYl61cc0c6e3ikpVjKbuk0a5UU5nSAvZ/LYhBtUTWhEDzt34SL7DD9DfcvMr+zLJ+nggG0wtnKwv1JjG7IODsY8w76ing/9W87i5ffkDJXDxzZlm5iMmbJM9Y+pbRY60S/ZuzrK4sC+bJw5jPrH2Gc444IOPXiuiRl/lajyciAfvVs+o3qCGZkHPck6/Iy5mf5x8yH8somOjVvGYp9acDw7YZmxTGnq8UuvdqJHCOivGrIOWgbVnPNwTI9tYGxA3T/hGnH5pWwuxC+fMP/G9M34GbqTLzIP8zyBxiMuquyaW/phXjvPWuYVrGTcp2tO/smKGfO+zGCdtfp5/T0B3rGxsTeXNgYGemZHJ8fGp54KXLe0LTwegE6qgACAP8RRI0l33p8GviAAAKytAXjcqFtsww2m1Bc3GSDKJQ15gfsI40B5DR/I3QaAnOGSp+BAMmChxbPhsfVj6138s6sI+GeJ31mASQAA7gQfCABinAFuoPlAOBcivCyASAsUKa3/tbWaWuqnejdBqM1hrcZr7S0MOvWfLy+8BgDwfceFS/zsno/da2urf/s5hG+AAA+czwr+wiqy+etcSGvLPYAAfJ9wU6ATLdkByYKglFGJEkagAwEA4AM5eCCCHwB0ABkQEKADcMCBBHgAMHcL7ooUYmwhhZ/zExnVadegWqUqTUTPg+iAffY71jDJpONEJdqJdKpU09er6i1DqVpNqjQz2EOUQO+iM9RQoyZQztPlgZahUGYP0qZkBv24RtWMDET77bGvVngAHWsLcAAAAAA=) format('woff2'); +} +@font-face{ + font-family: "dm"; + font-weight: bold; + font-style: normal; + unicode-range: U+F8FF-10FFFF; + src: local('☺'), + url(data:font/woff2;charset=utf-8;base64,d09GMk9UVE8AAAJ0AAsAAAAAA/QAAAIqAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAADYILGyAcKgZgAAQBNgIkAwQEBgWBdAcgGzADAC4D7IZjzymg62laLb5im9YqKYd4pp0IH5Q4wfM11t7f3RNPmCSGpIkh4c2yaCYkPItas2reuIZYSNQTM2Ss3ZtaSFrNsPSi0pFIpbFOplQSozQTgtx2D1DYR6JuhAypGLvREA2A7uApPgY1OYgeC8Ayvq0tfgCOy3/OjT9vz2+AA909pwF9/n14lic0TDDgLsCov3cD7HJP9eTYIyKzTeFwCA8+E4xCRLzVnb12vN/reyL9TgUEkfJgglcieJcPbf0Qt29BkrLv8J7zzxfL3bbhyD1wjGfBvFlGfC+BrvfzhBHx1jVXVjwp4PWnrNv+wl6+cBzQyXu6yQ23capTbj9uoPHOagFvx3+WIwca9UT0znIXQESUt2/589nB/3tmaqS0NLrz8cJ7wfRIj9sRh9955/JPstfOQqZk3y3czExOze66PSH71ADLTBIBnGTKJtW1sRzoGIAEAGBYnnxqGre7c9KDHAkko875Ih2j3eoVNMnzyi47CgTI/R/Y5O+BdBB/hz4QRzmYAVQZiQ3kMgAIIPD/xF1r5/lKD3MM+Fw1wOfx8Ya/07h5I9AoAAT0Q/wvR2PC/4+GwJWJmuVBIwSw0URCjAPogY6CoQcVigHUwNDQ4DoL6WWVtc7zaH+wHDXrjYnekaVEkWKlS6TJfCtX11KvsdHs+BudH+X2pDHteZVAhURkrVZjS6jKUF2dVSteBQb3BfvWuNnv6RUrUMSpBo7ENgAAAA==) format('woff2'); +} diff --git a/docs/md_v2/assets/docs.zip b/docs/md_v2/assets/docs.zip new file mode 100644 index 0000000000000000000000000000000000000000..6b28c0a85e3db6904f7f6cfe71d3af24de7bc80f Binary files /dev/null and b/docs/md_v2/assets/docs.zip differ diff --git a/docs/md_v2/assets/highlight.css b/docs/md_v2/assets/highlight.css new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/md_v2/assets/highlight.min.js b/docs/md_v2/assets/highlight.min.js new file mode 100644 index 0000000000000000000000000000000000000000..f43ba9aa976d34fcdbc310832994e749f2580617 --- /dev/null +++ b/docs/md_v2/assets/highlight.min.js @@ 
-0,0 +1,1213 @@ +/*! + Highlight.js v11.9.0 (git: f47103d4f1) + (c) 2006-2023 undefined and other contributors + License: BSD-3-Clause + */ + var hljs=function(){"use strict";function e(n){ + return n instanceof Map?n.clear=n.delete=n.set=()=>{ + throw Error("map is read-only")}:n instanceof Set&&(n.add=n.clear=n.delete=()=>{ + throw Error("set is read-only") + }),Object.freeze(n),Object.getOwnPropertyNames(n).forEach((t=>{ + const a=n[t],i=typeof a;"object"!==i&&"function"!==i||Object.isFrozen(a)||e(a) + })),n}class n{constructor(e){ + void 0===e.data&&(e.data={}),this.data=e.data,this.isMatchIgnored=!1} + ignoreMatch(){this.isMatchIgnored=!0}}function t(e){ + return e.replace(/&/g,"&").replace(//g,">").replace(/"/g,""").replace(/'/g,"'") + }function a(e,...n){const t=Object.create(null);for(const n in e)t[n]=e[n] + ;return n.forEach((e=>{for(const n in e)t[n]=e[n]})),t}const i=e=>!!e.scope + ;class r{constructor(e,n){ + this.buffer="",this.classPrefix=n.classPrefix,e.walk(this)}addText(e){ + this.buffer+=t(e)}openNode(e){if(!i(e))return;const n=((e,{prefix:n})=>{ + if(e.startsWith("language:"))return e.replace("language:","language-") + ;if(e.includes(".")){const t=e.split(".") + ;return[`${n}${t.shift()}`,...t.map(((e,n)=>`${e}${"_".repeat(n+1)}`))].join(" ") + }return`${n}${e}`})(e.scope,{prefix:this.classPrefix});this.span(n)} + closeNode(e){i(e)&&(this.buffer+="
    ")}value(){return this.buffer}span(e){ + this.buffer+=``}}const s=(e={})=>{const n={children:[]} + ;return Object.assign(n,e),n};class o{constructor(){ + this.rootNode=s(),this.stack=[this.rootNode]}get top(){ + return this.stack[this.stack.length-1]}get root(){return this.rootNode}add(e){ + this.top.children.push(e)}openNode(e){const n=s({scope:e}) + ;this.add(n),this.stack.push(n)}closeNode(){ + if(this.stack.length>1)return this.stack.pop()}closeAllNodes(){ + for(;this.closeNode(););}toJSON(){return JSON.stringify(this.rootNode,null,4)} + walk(e){return this.constructor._walk(e,this.rootNode)}static _walk(e,n){ + return"string"==typeof n?e.addText(n):n.children&&(e.openNode(n), + n.children.forEach((n=>this._walk(e,n))),e.closeNode(n)),e}static _collapse(e){ + "string"!=typeof e&&e.children&&(e.children.every((e=>"string"==typeof e))?e.children=[e.children.join("")]:e.children.forEach((e=>{ + o._collapse(e)})))}}class l extends o{constructor(e){super(),this.options=e} + addText(e){""!==e&&this.add(e)}startScope(e){this.openNode(e)}endScope(){ + this.closeNode()}__addSublanguage(e,n){const t=e.root + ;n&&(t.scope="language:"+n),this.add(t)}toHTML(){ + return new r(this,this.options).value()}finalize(){ + return this.closeAllNodes(),!0}}function c(e){ + return e?"string"==typeof e?e:e.source:null}function d(e){return b("(?=",e,")")} + function g(e){return b("(?:",e,")*")}function u(e){return b("(?:",e,")?")} + function b(...e){return e.map((e=>c(e))).join("")}function m(...e){const n=(e=>{ + const n=e[e.length-1] + ;return"object"==typeof n&&n.constructor===Object?(e.splice(e.length-1,1),n):{} + })(e);return"("+(n.capture?"":"?:")+e.map((e=>c(e))).join("|")+")"} + function p(e){return RegExp(e.toString()+"|").exec("").length-1} + const _=/\[(?:[^\\\]]|\\.)*\]|\(\??|\\([1-9][0-9]*)|\\./ + ;function h(e,{joinWith:n}){let t=0;return e.map((e=>{t+=1;const n=t + ;let a=c(e),i="";for(;a.length>0;){const e=_.exec(a);if(!e){i+=a;break} + i+=a.substring(0,e.index), + a=a.substring(e.index+e[0].length),"\\"===e[0][0]&&e[1]?i+="\\"+(Number(e[1])+n):(i+=e[0], + "("===e[0]&&t++)}return i})).map((e=>`(${e})`)).join(n)} + const f="[a-zA-Z]\\w*",E="[a-zA-Z_]\\w*",y="\\b\\d+(\\.\\d+)?",N="(-?)(\\b0[xX][a-fA-F0-9]+|(\\b\\d+(\\.\\d*)?|\\.\\d+)([eE][-+]?\\d+)?)",w="\\b(0b[01]+)",v={ + begin:"\\\\[\\s\\S]",relevance:0},O={scope:"string",begin:"'",end:"'", + illegal:"\\n",contains:[v]},k={scope:"string",begin:'"',end:'"',illegal:"\\n", + contains:[v]},x=(e,n,t={})=>{const i=a({scope:"comment",begin:e,end:n, + contains:[]},t);i.contains.push({scope:"doctag", + begin:"[ ]*(?=(TODO|FIXME|NOTE|BUG|OPTIMIZE|HACK|XXX):)", + end:/(TODO|FIXME|NOTE|BUG|OPTIMIZE|HACK|XXX):/,excludeBegin:!0,relevance:0}) + ;const r=m("I","a","is","so","us","to","at","if","in","it","on",/[A-Za-z]+['](d|ve|re|ll|t|s|n)/,/[A-Za-z]+[-][a-z]+/,/[A-Za-z][a-z]{2,}/) + ;return i.contains.push({begin:b(/[ ]+/,"(",r,/[.]?[:]?([.][ ]|[ ])/,"){3}")}),i + },M=x("//","$"),S=x("/\\*","\\*/"),A=x("#","$");var C=Object.freeze({ + __proto__:null,APOS_STRING_MODE:O,BACKSLASH_ESCAPE:v,BINARY_NUMBER_MODE:{ + scope:"number",begin:w,relevance:0},BINARY_NUMBER_RE:w,COMMENT:x, + C_BLOCK_COMMENT_MODE:S,C_LINE_COMMENT_MODE:M,C_NUMBER_MODE:{scope:"number", + begin:N,relevance:0},C_NUMBER_RE:N,END_SAME_AS_BEGIN:e=>Object.assign(e,{ + "on:begin":(e,n)=>{n.data._beginMatch=e[1]},"on:end":(e,n)=>{ + n.data._beginMatch!==e[1]&&n.ignoreMatch()}}),HASH_COMMENT_MODE:A,IDENT_RE:f, + MATCH_NOTHING_RE:/\b\B/,METHOD_GUARD:{begin:"\\.\\s*"+E,relevance:0}, + 
NUMBER_MODE:{scope:"number",begin:y,relevance:0},NUMBER_RE:y, + PHRASAL_WORDS_MODE:{ + begin:/\b(a|an|the|are|I'm|isn't|don't|doesn't|won't|but|just|should|pretty|simply|enough|gonna|going|wtf|so|such|will|you|your|they|like|more)\b/ + },QUOTE_STRING_MODE:k,REGEXP_MODE:{scope:"regexp",begin:/\/(?=[^/\n]*\/)/, + end:/\/[gimuy]*/,contains:[v,{begin:/\[/,end:/\]/,relevance:0,contains:[v]}]}, + RE_STARTERS_RE:"!|!=|!==|%|%=|&|&&|&=|\\*|\\*=|\\+|\\+=|,|-|-=|/=|/|:|;|<<|<<=|<=|<|===|==|=|>>>=|>>=|>=|>>>|>>|>|\\?|\\[|\\{|\\(|\\^|\\^=|\\||\\|=|\\|\\||~", + SHEBANG:(e={})=>{const n=/^#![ ]*\// + ;return e.binary&&(e.begin=b(n,/.*\b/,e.binary,/\b.*/)),a({scope:"meta",begin:n, + end:/$/,relevance:0,"on:begin":(e,n)=>{0!==e.index&&n.ignoreMatch()}},e)}, + TITLE_MODE:{scope:"title",begin:f,relevance:0},UNDERSCORE_IDENT_RE:E, + UNDERSCORE_TITLE_MODE:{scope:"title",begin:E,relevance:0}});function T(e,n){ + "."===e.input[e.index-1]&&n.ignoreMatch()}function R(e,n){ + void 0!==e.className&&(e.scope=e.className,delete e.className)}function D(e,n){ + n&&e.beginKeywords&&(e.begin="\\b("+e.beginKeywords.split(" ").join("|")+")(?!\\.)(?=\\b|\\s)", + e.__beforeBegin=T,e.keywords=e.keywords||e.beginKeywords,delete e.beginKeywords, + void 0===e.relevance&&(e.relevance=0))}function I(e,n){ + Array.isArray(e.illegal)&&(e.illegal=m(...e.illegal))}function L(e,n){ + if(e.match){ + if(e.begin||e.end)throw Error("begin & end are not supported with match") + ;e.begin=e.match,delete e.match}}function B(e,n){ + void 0===e.relevance&&(e.relevance=1)}const $=(e,n)=>{if(!e.beforeMatch)return + ;if(e.starts)throw Error("beforeMatch cannot be used with starts") + ;const t=Object.assign({},e);Object.keys(e).forEach((n=>{delete e[n] + })),e.keywords=t.keywords,e.begin=b(t.beforeMatch,d(t.begin)),e.starts={ + relevance:0,contains:[Object.assign(t,{endsParent:!0})] + },e.relevance=0,delete t.beforeMatch + },z=["of","and","for","in","not","or","if","then","parent","list","value"],F="keyword" + ;function U(e,n,t=F){const a=Object.create(null) + ;return"string"==typeof e?i(t,e.split(" ")):Array.isArray(e)?i(t,e):Object.keys(e).forEach((t=>{ + Object.assign(a,U(e[t],n,t))})),a;function i(e,t){ + n&&(t=t.map((e=>e.toLowerCase()))),t.forEach((n=>{const t=n.split("|") + ;a[t[0]]=[e,j(t[0],t[1])]}))}}function j(e,n){ + return n?Number(n):(e=>z.includes(e.toLowerCase()))(e)?0:1}const P={},K=e=>{ + console.error(e)},H=(e,...n)=>{console.log("WARN: "+e,...n)},q=(e,n)=>{ + P[`${e}/${n}`]||(console.log(`Deprecated as of ${e}. 
${n}`),P[`${e}/${n}`]=!0) + },G=Error();function Z(e,n,{key:t}){let a=0;const i=e[t],r={},s={} + ;for(let e=1;e<=n.length;e++)s[e+a]=i[e],r[e+a]=!0,a+=p(n[e-1]) + ;e[t]=s,e[t]._emit=r,e[t]._multi=!0}function W(e){(e=>{ + e.scope&&"object"==typeof e.scope&&null!==e.scope&&(e.beginScope=e.scope, + delete e.scope)})(e),"string"==typeof e.beginScope&&(e.beginScope={ + _wrap:e.beginScope}),"string"==typeof e.endScope&&(e.endScope={_wrap:e.endScope + }),(e=>{if(Array.isArray(e.begin)){ + if(e.skip||e.excludeBegin||e.returnBegin)throw K("skip, excludeBegin, returnBegin not compatible with beginScope: {}"), + G + ;if("object"!=typeof e.beginScope||null===e.beginScope)throw K("beginScope must be object"), + G;Z(e,e.begin,{key:"beginScope"}),e.begin=h(e.begin,{joinWith:""})}})(e),(e=>{ + if(Array.isArray(e.end)){ + if(e.skip||e.excludeEnd||e.returnEnd)throw K("skip, excludeEnd, returnEnd not compatible with endScope: {}"), + G + ;if("object"!=typeof e.endScope||null===e.endScope)throw K("endScope must be object"), + G;Z(e,e.end,{key:"endScope"}),e.end=h(e.end,{joinWith:""})}})(e)}function Q(e){ + function n(n,t){ + return RegExp(c(n),"m"+(e.case_insensitive?"i":"")+(e.unicodeRegex?"u":"")+(t?"g":"")) + }class t{constructor(){ + this.matchIndexes={},this.regexes=[],this.matchAt=1,this.position=0} + addRule(e,n){ + n.position=this.position++,this.matchIndexes[this.matchAt]=n,this.regexes.push([n,e]), + this.matchAt+=p(e)+1}compile(){0===this.regexes.length&&(this.exec=()=>null) + ;const e=this.regexes.map((e=>e[1]));this.matcherRe=n(h(e,{joinWith:"|" + }),!0),this.lastIndex=0}exec(e){this.matcherRe.lastIndex=this.lastIndex + ;const n=this.matcherRe.exec(e);if(!n)return null + ;const t=n.findIndex(((e,n)=>n>0&&void 0!==e)),a=this.matchIndexes[t] + ;return n.splice(0,t),Object.assign(n,a)}}class i{constructor(){ + this.rules=[],this.multiRegexes=[], + this.count=0,this.lastIndex=0,this.regexIndex=0}getMatcher(e){ + if(this.multiRegexes[e])return this.multiRegexes[e];const n=new t + ;return this.rules.slice(e).forEach((([e,t])=>n.addRule(e,t))), + n.compile(),this.multiRegexes[e]=n,n}resumingScanAtSamePosition(){ + return 0!==this.regexIndex}considerAll(){this.regexIndex=0}addRule(e,n){ + this.rules.push([e,n]),"begin"===n.type&&this.count++}exec(e){ + const n=this.getMatcher(this.regexIndex);n.lastIndex=this.lastIndex + ;let t=n.exec(e) + ;if(this.resumingScanAtSamePosition())if(t&&t.index===this.lastIndex);else{ + const n=this.getMatcher(0);n.lastIndex=this.lastIndex+1,t=n.exec(e)} + return t&&(this.regexIndex+=t.position+1, + this.regexIndex===this.count&&this.considerAll()),t}} + if(e.compilerExtensions||(e.compilerExtensions=[]), + e.contains&&e.contains.includes("self"))throw Error("ERR: contains `self` is not supported at the top-level of a language. 
See documentation.") + ;return e.classNameAliases=a(e.classNameAliases||{}),function t(r,s){const o=r + ;if(r.isCompiled)return o + ;[R,L,W,$].forEach((e=>e(r,s))),e.compilerExtensions.forEach((e=>e(r,s))), + r.__beforeBegin=null,[D,I,B].forEach((e=>e(r,s))),r.isCompiled=!0;let l=null + ;return"object"==typeof r.keywords&&r.keywords.$pattern&&(r.keywords=Object.assign({},r.keywords), + l=r.keywords.$pattern, + delete r.keywords.$pattern),l=l||/\w+/,r.keywords&&(r.keywords=U(r.keywords,e.case_insensitive)), + o.keywordPatternRe=n(l,!0), + s&&(r.begin||(r.begin=/\B|\b/),o.beginRe=n(o.begin),r.end||r.endsWithParent||(r.end=/\B|\b/), + r.end&&(o.endRe=n(o.end)), + o.terminatorEnd=c(o.end)||"",r.endsWithParent&&s.terminatorEnd&&(o.terminatorEnd+=(r.end?"|":"")+s.terminatorEnd)), + r.illegal&&(o.illegalRe=n(r.illegal)), + r.contains||(r.contains=[]),r.contains=[].concat(...r.contains.map((e=>(e=>(e.variants&&!e.cachedVariants&&(e.cachedVariants=e.variants.map((n=>a(e,{ + variants:null},n)))),e.cachedVariants?e.cachedVariants:X(e)?a(e,{ + starts:e.starts?a(e.starts):null + }):Object.isFrozen(e)?a(e):e))("self"===e?r:e)))),r.contains.forEach((e=>{t(e,o) + })),r.starts&&t(r.starts,s),o.matcher=(e=>{const n=new i + ;return e.contains.forEach((e=>n.addRule(e.begin,{rule:e,type:"begin" + }))),e.terminatorEnd&&n.addRule(e.terminatorEnd,{type:"end" + }),e.illegal&&n.addRule(e.illegal,{type:"illegal"}),n})(o),o}(e)}function X(e){ + return!!e&&(e.endsWithParent||X(e.starts))}class V extends Error{ + constructor(e,n){super(e),this.name="HTMLInjectionError",this.html=n}} + const J=t,Y=a,ee=Symbol("nomatch"),ne=t=>{ + const a=Object.create(null),i=Object.create(null),r=[];let s=!0 + ;const o="Could not find the language '{}', did you forget to load/include a language module?",c={ + disableAutodetect:!0,name:"Plain text",contains:[]};let p={ + ignoreUnescapedHTML:!1,throwUnescapedHTML:!1,noHighlightRe:/^(no-?highlight)$/i, + languageDetectRe:/\blang(?:uage)?-([\w-]+)\b/i,classPrefix:"hljs-", + cssSelector:"pre code",languages:null,__emitter:l};function _(e){ + return p.noHighlightRe.test(e)}function h(e,n,t){let a="",i="" + ;"object"==typeof n?(a=e, + t=n.ignoreIllegals,i=n.language):(q("10.7.0","highlight(lang, code, ...args) has been deprecated."), + q("10.7.0","Please use highlight(code, options) instead.\nhttps://github.com/highlightjs/highlight.js/issues/2277"), + i=e,a=n),void 0===t&&(t=!0);const r={code:a,language:i};x("before:highlight",r) + ;const s=r.result?r.result:f(r.language,r.code,t) + ;return s.code=r.code,x("after:highlight",s),s}function f(e,t,i,r){ + const l=Object.create(null);function c(){if(!x.keywords)return void S.addText(A) + ;let e=0;x.keywordPatternRe.lastIndex=0;let n=x.keywordPatternRe.exec(A),t="" + ;for(;n;){t+=A.substring(e,n.index) + ;const i=w.case_insensitive?n[0].toLowerCase():n[0],r=(a=i,x.keywords[a]);if(r){ + const[e,a]=r + ;if(S.addText(t),t="",l[i]=(l[i]||0)+1,l[i]<=7&&(C+=a),e.startsWith("_"))t+=n[0];else{ + const t=w.classNameAliases[e]||e;g(n[0],t)}}else t+=n[0] + ;e=x.keywordPatternRe.lastIndex,n=x.keywordPatternRe.exec(A)}var a + ;t+=A.substring(e),S.addText(t)}function d(){null!=x.subLanguage?(()=>{ + if(""===A)return;let e=null;if("string"==typeof x.subLanguage){ + if(!a[x.subLanguage])return void S.addText(A) + ;e=f(x.subLanguage,A,!0,M[x.subLanguage]),M[x.subLanguage]=e._top + }else e=E(A,x.subLanguage.length?x.subLanguage:null) + ;x.relevance>0&&(C+=e.relevance),S.__addSublanguage(e._emitter,e.language) + })():c(),A=""}function g(e,n){ + 
""!==e&&(S.startScope(n),S.addText(e),S.endScope())}function u(e,n){let t=1 + ;const a=n.length-1;for(;t<=a;){if(!e._emit[t]){t++;continue} + const a=w.classNameAliases[e[t]]||e[t],i=n[t];a?g(i,a):(A=i,c(),A=""),t++}} + function b(e,n){ + return e.scope&&"string"==typeof e.scope&&S.openNode(w.classNameAliases[e.scope]||e.scope), + e.beginScope&&(e.beginScope._wrap?(g(A,w.classNameAliases[e.beginScope._wrap]||e.beginScope._wrap), + A=""):e.beginScope._multi&&(u(e.beginScope,n),A="")),x=Object.create(e,{parent:{ + value:x}}),x}function m(e,t,a){let i=((e,n)=>{const t=e&&e.exec(n) + ;return t&&0===t.index})(e.endRe,a);if(i){if(e["on:end"]){const a=new n(e) + ;e["on:end"](t,a),a.isMatchIgnored&&(i=!1)}if(i){ + for(;e.endsParent&&e.parent;)e=e.parent;return e}} + if(e.endsWithParent)return m(e.parent,t,a)}function _(e){ + return 0===x.matcher.regexIndex?(A+=e[0],1):(D=!0,0)}function h(e){ + const n=e[0],a=t.substring(e.index),i=m(x,e,a);if(!i)return ee;const r=x + ;x.endScope&&x.endScope._wrap?(d(), + g(n,x.endScope._wrap)):x.endScope&&x.endScope._multi?(d(), + u(x.endScope,e)):r.skip?A+=n:(r.returnEnd||r.excludeEnd||(A+=n), + d(),r.excludeEnd&&(A=n));do{ + x.scope&&S.closeNode(),x.skip||x.subLanguage||(C+=x.relevance),x=x.parent + }while(x!==i.parent);return i.starts&&b(i.starts,e),r.returnEnd?0:n.length} + let y={};function N(a,r){const o=r&&r[0];if(A+=a,null==o)return d(),0 + ;if("begin"===y.type&&"end"===r.type&&y.index===r.index&&""===o){ + if(A+=t.slice(r.index,r.index+1),!s){const n=Error(`0 width match regex (${e})`) + ;throw n.languageName=e,n.badRule=y.rule,n}return 1} + if(y=r,"begin"===r.type)return(e=>{ + const t=e[0],a=e.rule,i=new n(a),r=[a.__beforeBegin,a["on:begin"]] + ;for(const n of r)if(n&&(n(e,i),i.isMatchIgnored))return _(t) + ;return a.skip?A+=t:(a.excludeBegin&&(A+=t), + d(),a.returnBegin||a.excludeBegin||(A=t)),b(a,e),a.returnBegin?0:t.length})(r) + ;if("illegal"===r.type&&!i){ + const e=Error('Illegal lexeme "'+o+'" for mode "'+(x.scope||"")+'"') + ;throw e.mode=x,e}if("end"===r.type){const e=h(r);if(e!==ee)return e} + if("illegal"===r.type&&""===o)return 1 + ;if(R>1e5&&R>3*r.index)throw Error("potential infinite loop, way more iterations than matches") + ;return A+=o,o.length}const w=v(e) + ;if(!w)throw K(o.replace("{}",e)),Error('Unknown language: "'+e+'"') + ;const O=Q(w);let k="",x=r||O;const M={},S=new p.__emitter(p);(()=>{const e=[] + ;for(let n=x;n!==w;n=n.parent)n.scope&&e.unshift(n.scope) + ;e.forEach((e=>S.openNode(e)))})();let A="",C=0,T=0,R=0,D=!1;try{ + if(w.__emitTokens)w.__emitTokens(t,S);else{for(x.matcher.considerAll();;){ + R++,D?D=!1:x.matcher.considerAll(),x.matcher.lastIndex=T + ;const e=x.matcher.exec(t);if(!e)break;const n=N(t.substring(T,e.index),e) + ;T=e.index+n}N(t.substring(T))}return S.finalize(),k=S.toHTML(),{language:e, + value:k,relevance:C,illegal:!1,_emitter:S,_top:x}}catch(n){ + if(n.message&&n.message.includes("Illegal"))return{language:e,value:J(t), + illegal:!0,relevance:0,_illegalBy:{message:n.message,index:T, + context:t.slice(T-100,T+100),mode:n.mode,resultSoFar:k},_emitter:S};if(s)return{ + language:e,value:J(t),illegal:!1,relevance:0,errorRaised:n,_emitter:S,_top:x} + ;throw n}}function E(e,n){n=n||p.languages||Object.keys(a);const t=(e=>{ + const n={value:J(e),illegal:!1,relevance:0,_top:c,_emitter:new p.__emitter(p)} + ;return n._emitter.addText(e),n})(e),i=n.filter(v).filter(k).map((n=>f(n,e,!1))) + ;i.unshift(t);const r=i.sort(((e,n)=>{ + if(e.relevance!==n.relevance)return n.relevance-e.relevance + 
;if(e.language&&n.language){if(v(e.language).supersetOf===n.language)return 1 + ;if(v(n.language).supersetOf===e.language)return-1}return 0})),[s,o]=r,l=s + ;return l.secondBest=o,l}function y(e){let n=null;const t=(e=>{ + let n=e.className+" ";n+=e.parentNode?e.parentNode.className:"" + ;const t=p.languageDetectRe.exec(n);if(t){const n=v(t[1]) + ;return n||(H(o.replace("{}",t[1])), + H("Falling back to no-highlight mode for this block.",e)),n?t[1]:"no-highlight"} + return n.split(/\s+/).find((e=>_(e)||v(e)))})(e);if(_(t))return + ;if(x("before:highlightElement",{el:e,language:t + }),e.dataset.highlighted)return void console.log("Element previously highlighted. To highlight again, first unset `dataset.highlighted`.",e) + ;if(e.children.length>0&&(p.ignoreUnescapedHTML||(console.warn("One of your code blocks includes unescaped HTML. This is a potentially serious security risk."), + console.warn("https://github.com/highlightjs/highlight.js/wiki/security"), + console.warn("The element with unescaped HTML:"), + console.warn(e)),p.throwUnescapedHTML))throw new V("One of your code blocks includes unescaped HTML.",e.innerHTML) + ;n=e;const a=n.textContent,r=t?h(a,{language:t,ignoreIllegals:!0}):E(a) + ;e.innerHTML=r.value,e.dataset.highlighted="yes",((e,n,t)=>{const a=n&&i[n]||t + ;e.classList.add("hljs"),e.classList.add("language-"+a) + })(e,t,r.language),e.result={language:r.language,re:r.relevance, + relevance:r.relevance},r.secondBest&&(e.secondBest={ + language:r.secondBest.language,relevance:r.secondBest.relevance + }),x("after:highlightElement",{el:e,result:r,text:a})}let N=!1;function w(){ + "loading"!==document.readyState?document.querySelectorAll(p.cssSelector).forEach(y):N=!0 + }function v(e){return e=(e||"").toLowerCase(),a[e]||a[i[e]]} + function O(e,{languageName:n}){"string"==typeof e&&(e=[e]),e.forEach((e=>{ + i[e.toLowerCase()]=n}))}function k(e){const n=v(e) + ;return n&&!n.disableAutodetect}function x(e,n){const t=e;r.forEach((e=>{ + e[t]&&e[t](n)}))} + "undefined"!=typeof window&&window.addEventListener&&window.addEventListener("DOMContentLoaded",(()=>{ + N&&w()}),!1),Object.assign(t,{highlight:h,highlightAuto:E,highlightAll:w, + highlightElement:y, + highlightBlock:e=>(q("10.7.0","highlightBlock will be removed entirely in v12.0"), + q("10.7.0","Please use highlightElement now."),y(e)),configure:e=>{p=Y(p,e)}, + initHighlighting:()=>{ + w(),q("10.6.0","initHighlighting() deprecated. Use highlightAll() now.")}, + initHighlightingOnLoad:()=>{ + w(),q("10.6.0","initHighlightingOnLoad() deprecated. 
Use highlightAll() now.") + },registerLanguage:(e,n)=>{let i=null;try{i=n(t)}catch(n){ + if(K("Language definition for '{}' could not be registered.".replace("{}",e)), + !s)throw n;K(n),i=c} + i.name||(i.name=e),a[e]=i,i.rawDefinition=n.bind(null,t),i.aliases&&O(i.aliases,{ + languageName:e})},unregisterLanguage:e=>{delete a[e] + ;for(const n of Object.keys(i))i[n]===e&&delete i[n]}, + listLanguages:()=>Object.keys(a),getLanguage:v,registerAliases:O, + autoDetection:k,inherit:Y,addPlugin:e=>{(e=>{ + e["before:highlightBlock"]&&!e["before:highlightElement"]&&(e["before:highlightElement"]=n=>{ + e["before:highlightBlock"](Object.assign({block:n.el},n)) + }),e["after:highlightBlock"]&&!e["after:highlightElement"]&&(e["after:highlightElement"]=n=>{ + e["after:highlightBlock"](Object.assign({block:n.el},n))})})(e),r.push(e)}, + removePlugin:e=>{const n=r.indexOf(e);-1!==n&&r.splice(n,1)}}),t.debugMode=()=>{ + s=!1},t.safeMode=()=>{s=!0},t.versionString="11.9.0",t.regex={concat:b, + lookahead:d,either:m,optional:u,anyNumberOfTimes:g} + ;for(const n in C)"object"==typeof C[n]&&e(C[n]);return Object.assign(t,C),t + },te=ne({});te.newInstance=()=>ne({});var ae=te;const ie=e=>({IMPORTANT:{ + scope:"meta",begin:"!important"},BLOCK_COMMENT:e.C_BLOCK_COMMENT_MODE,HEXCOLOR:{ + scope:"number",begin:/#(([0-9a-fA-F]{3,4})|(([0-9a-fA-F]{2}){3,4}))\b/}, + FUNCTION_DISPATCH:{className:"built_in",begin:/[\w-]+(?=\()/}, + ATTRIBUTE_SELECTOR_MODE:{scope:"selector-attr",begin:/\[/,end:/\]/,illegal:"$", + contains:[e.APOS_STRING_MODE,e.QUOTE_STRING_MODE]},CSS_NUMBER_MODE:{ + scope:"number", + begin:e.NUMBER_RE+"(%|em|ex|ch|rem|vw|vh|vmin|vmax|cm|mm|in|pt|pc|px|deg|grad|rad|turn|s|ms|Hz|kHz|dpi|dpcm|dppx)?", + relevance:0},CSS_VARIABLE:{className:"attr",begin:/--[A-Za-z_][A-Za-z0-9_-]*/} + 
}),re=["a","abbr","address","article","aside","audio","b","blockquote","body","button","canvas","caption","cite","code","dd","del","details","dfn","div","dl","dt","em","fieldset","figcaption","figure","footer","form","h1","h2","h3","h4","h5","h6","header","hgroup","html","i","iframe","img","input","ins","kbd","label","legend","li","main","mark","menu","nav","object","ol","p","q","quote","samp","section","span","strong","summary","sup","table","tbody","td","textarea","tfoot","th","thead","time","tr","ul","var","video"],se=["any-hover","any-pointer","aspect-ratio","color","color-gamut","color-index","device-aspect-ratio","device-height","device-width","display-mode","forced-colors","grid","height","hover","inverted-colors","monochrome","orientation","overflow-block","overflow-inline","pointer","prefers-color-scheme","prefers-contrast","prefers-reduced-motion","prefers-reduced-transparency","resolution","scan","scripting","update","width","min-width","max-width","min-height","max-height"],oe=["active","any-link","blank","checked","current","default","defined","dir","disabled","drop","empty","enabled","first","first-child","first-of-type","fullscreen","future","focus","focus-visible","focus-within","has","host","host-context","hover","indeterminate","in-range","invalid","is","lang","last-child","last-of-type","left","link","local-link","not","nth-child","nth-col","nth-last-child","nth-last-col","nth-last-of-type","nth-of-type","only-child","only-of-type","optional","out-of-range","past","placeholder-shown","read-only","read-write","required","right","root","scope","target","target-within","user-invalid","valid","visited","where"],le=["after","backdrop","before","cue","cue-region","first-letter","first-line","grammar-error","marker","part","placeholder","selection","slotted","spelling-error"],ce=["align-content","align-items","align-self","all","animation","animation-delay","animation-direction","animation-duration","animation-fill-mode","animation-iteration-count","animation-name","animation-play-state","animation-timing-function","backface-visibility","background","background-attachment","background-blend-mode","background-clip","background-color","background-image","background-origin","background-position","background-repeat","background-size","block-size","border","border-block","border-block-color","border-block-end","border-block-end-color","border-block-end-style","border-block-end-width","border-block-start","border-block-start-color","border-block-start-style","border-block-start-width","border-block-style","border-block-width","border-bottom","border-bottom-color","border-bottom-left-radius","border-bottom-right-radius","border-bottom-style","border-bottom-width","border-collapse","border-color","border-image","border-image-outset","border-image-repeat","border-image-slice","border-image-source","border-image-width","border-inline","border-inline-color","border-inline-end","border-inline-end-color","border-inline-end-style","border-inline-end-width","border-inline-start","border-inline-start-color","border-inline-start-style","border-inline-start-width","border-inline-style","border-inline-width","border-left","border-left-color","border-left-style","border-left-width","border-radius","border-right","border-right-color","border-right-style","border-right-width","border-spacing","border-style","border-top","border-top-color","border-top-left-radius","border-top-right-radius","border-top-style","border-top-width","border-width","bottom","box-decoration-break","box-shadow","box-sizing","b
reak-after","break-before","break-inside","caption-side","caret-color","clear","clip","clip-path","clip-rule","color","column-count","column-fill","column-gap","column-rule","column-rule-color","column-rule-style","column-rule-width","column-span","column-width","columns","contain","content","content-visibility","counter-increment","counter-reset","cue","cue-after","cue-before","cursor","direction","display","empty-cells","filter","flex","flex-basis","flex-direction","flex-flow","flex-grow","flex-shrink","flex-wrap","float","flow","font","font-display","font-family","font-feature-settings","font-kerning","font-language-override","font-size","font-size-adjust","font-smoothing","font-stretch","font-style","font-synthesis","font-variant","font-variant-caps","font-variant-east-asian","font-variant-ligatures","font-variant-numeric","font-variant-position","font-variation-settings","font-weight","gap","glyph-orientation-vertical","grid","grid-area","grid-auto-columns","grid-auto-flow","grid-auto-rows","grid-column","grid-column-end","grid-column-start","grid-gap","grid-row","grid-row-end","grid-row-start","grid-template","grid-template-areas","grid-template-columns","grid-template-rows","hanging-punctuation","height","hyphens","icon","image-orientation","image-rendering","image-resolution","ime-mode","inline-size","isolation","justify-content","left","letter-spacing","line-break","line-height","list-style","list-style-image","list-style-position","list-style-type","margin","margin-block","margin-block-end","margin-block-start","margin-bottom","margin-inline","margin-inline-end","margin-inline-start","margin-left","margin-right","margin-top","marks","mask","mask-border","mask-border-mode","mask-border-outset","mask-border-repeat","mask-border-slice","mask-border-source","mask-border-width","mask-clip","mask-composite","mask-image","mask-mode","mask-origin","mask-position","mask-repeat","mask-size","mask-type","max-block-size","max-height","max-inline-size","max-width","min-block-size","min-height","min-inline-size","min-width","mix-blend-mode","nav-down","nav-index","nav-left","nav-right","nav-up","none","normal","object-fit","object-position","opacity","order","orphans","outline","outline-color","outline-offset","outline-style","outline-width","overflow","overflow-wrap","overflow-x","overflow-y","padding","padding-block","padding-block-end","padding-block-start","padding-bottom","padding-inline","padding-inline-end","padding-inline-start","padding-left","padding-right","padding-top","page-break-after","page-break-before","page-break-inside","pause","pause-after","pause-before","perspective","perspective-origin","pointer-events","position","quotes","resize","rest","rest-after","rest-before","right","row-gap","scroll-margin","scroll-margin-block","scroll-margin-block-end","scroll-margin-block-start","scroll-margin-bottom","scroll-margin-inline","scroll-margin-inline-end","scroll-margin-inline-start","scroll-margin-left","scroll-margin-right","scroll-margin-top","scroll-padding","scroll-padding-block","scroll-padding-block-end","scroll-padding-block-start","scroll-padding-bottom","scroll-padding-inline","scroll-padding-inline-end","scroll-padding-inline-start","scroll-padding-left","scroll-padding-right","scroll-padding-top","scroll-snap-align","scroll-snap-stop","scroll-snap-type","scrollbar-color","scrollbar-gutter","scrollbar-width","shape-image-threshold","shape-margin","shape-outside","speak","speak-as","src","tab-size","table-layout","text-align","text-align-all","text-align-last","text-combin
e-upright","text-decoration","text-decoration-color","text-decoration-line","text-decoration-style","text-emphasis","text-emphasis-color","text-emphasis-position","text-emphasis-style","text-indent","text-justify","text-orientation","text-overflow","text-rendering","text-shadow","text-transform","text-underline-position","top","transform","transform-box","transform-origin","transform-style","transition","transition-delay","transition-duration","transition-property","transition-timing-function","unicode-bidi","vertical-align","visibility","voice-balance","voice-duration","voice-family","voice-pitch","voice-range","voice-rate","voice-stress","voice-volume","white-space","widows","width","will-change","word-break","word-spacing","word-wrap","writing-mode","z-index"].reverse(),de=oe.concat(le) + ;var ge="[0-9](_*[0-9])*",ue=`\\.(${ge})`,be="[0-9a-fA-F](_*[0-9a-fA-F])*",me={ + className:"number",variants:[{ + begin:`(\\b(${ge})((${ue})|\\.)?|(${ue}))[eE][+-]?(${ge})[fFdD]?\\b`},{ + begin:`\\b(${ge})((${ue})[fFdD]?\\b|\\.([fFdD]\\b)?)`},{ + begin:`(${ue})[fFdD]?\\b`},{begin:`\\b(${ge})[fFdD]\\b`},{ + begin:`\\b0[xX]((${be})\\.?|(${be})?\\.(${be}))[pP][+-]?(${ge})[fFdD]?\\b`},{ + begin:"\\b(0|[1-9](_*[0-9])*)[lL]?\\b"},{begin:`\\b0[xX](${be})[lL]?\\b`},{ + begin:"\\b0(_*[0-7])*[lL]?\\b"},{begin:"\\b0[bB][01](_*[01])*[lL]?\\b"}], + relevance:0};function pe(e,n,t){return-1===t?"":e.replace(n,(a=>pe(e,n,t-1)))} + const _e="[A-Za-z$_][0-9A-Za-z$_]*",he=["as","in","of","if","for","while","finally","var","new","function","do","return","void","else","break","catch","instanceof","with","throw","case","default","try","switch","continue","typeof","delete","let","yield","const","class","debugger","async","await","static","import","from","export","extends"],fe=["true","false","null","undefined","NaN","Infinity"],Ee=["Object","Function","Boolean","Symbol","Math","Date","Number","BigInt","String","RegExp","Array","Float32Array","Float64Array","Int8Array","Uint8Array","Uint8ClampedArray","Int16Array","Int32Array","Uint16Array","Uint32Array","BigInt64Array","BigUint64Array","Set","Map","WeakSet","WeakMap","ArrayBuffer","SharedArrayBuffer","Atomics","DataView","JSON","Promise","Generator","GeneratorFunction","AsyncFunction","Reflect","Proxy","Intl","WebAssembly"],ye=["Error","EvalError","InternalError","RangeError","ReferenceError","SyntaxError","TypeError","URIError"],Ne=["setInterval","setTimeout","clearInterval","clearTimeout","require","exports","eval","isFinite","isNaN","parseFloat","parseInt","decodeURI","decodeURIComponent","encodeURI","encodeURIComponent","escape","unescape"],we=["arguments","this","super","console","window","document","localStorage","sessionStorage","module","global"],ve=[].concat(Ne,Ee,ye) + ;function Oe(e){const n=e.regex,t=_e,a={begin:/<[A-Za-z0-9\\._:-]+/, + end:/\/[A-Za-z0-9\\._:-]+>|\/>/,isTrulyOpeningTag:(e,n)=>{ + const t=e[0].length+e.index,a=e.input[t] + ;if("<"===a||","===a)return void n.ignoreMatch();let i + ;">"===a&&(((e,{after:n})=>{const t="",M={ + match:[/const|var|let/,/\s+/,t,/\s*/,/=\s*/,/(async\s*)?/,n.lookahead(x)], + keywords:"async",className:{1:"keyword",3:"title.function"},contains:[f]} + ;return{name:"JavaScript",aliases:["js","jsx","mjs","cjs"],keywords:i,exports:{ + PARAMS_CONTAINS:h,CLASS_REFERENCE:y},illegal:/#(?![$_A-z])/, + contains:[e.SHEBANG({label:"shebang",binary:"node",relevance:5}),{ + label:"use_strict",className:"meta",relevance:10, + begin:/^\s*['"]use (strict|asm)['"]/ + },e.APOS_STRING_MODE,e.QUOTE_STRING_MODE,d,g,u,b,m,{match:/\$\d+/},l,y,{ + 
className:"attr",begin:t+n.lookahead(":"),relevance:0},M,{ + begin:"("+e.RE_STARTERS_RE+"|\\b(case|return|throw)\\b)\\s*", + keywords:"return throw case",relevance:0,contains:[m,e.REGEXP_MODE,{ + className:"function",begin:x,returnBegin:!0,end:"\\s*=>",contains:[{ + className:"params",variants:[{begin:e.UNDERSCORE_IDENT_RE,relevance:0},{ + className:null,begin:/\(\s*\)/,skip:!0},{begin:/\(/,end:/\)/,excludeBegin:!0, + excludeEnd:!0,keywords:i,contains:h}]}]},{begin:/,/,relevance:0},{match:/\s+/, + relevance:0},{variants:[{begin:"<>",end:""},{ + match:/<[A-Za-z0-9\\._:-]+\s*\/>/},{begin:a.begin, + "on:begin":a.isTrulyOpeningTag,end:a.end}],subLanguage:"xml",contains:[{ + begin:a.begin,end:a.end,skip:!0,contains:["self"]}]}]},N,{ + beginKeywords:"while if switch catch for"},{ + begin:"\\b(?!function)"+e.UNDERSCORE_IDENT_RE+"\\([^()]*(\\([^()]*(\\([^()]*\\)[^()]*)*\\)[^()]*)*\\)\\s*\\{", + returnBegin:!0,label:"func.def",contains:[f,e.inherit(e.TITLE_MODE,{begin:t, + className:"title.function"})]},{match:/\.\.\./,relevance:0},O,{match:"\\$"+t, + relevance:0},{match:[/\bconstructor(?=\s*\()/],className:{1:"title.function"}, + contains:[f]},w,{relevance:0,match:/\b[A-Z][A-Z_0-9]+\b/, + className:"variable.constant"},E,k,{match:/\$[(.]/}]}} + const ke=e=>b(/\b/,e,/\w$/.test(e)?/\b/:/\B/),xe=["Protocol","Type"].map(ke),Me=["init","self"].map(ke),Se=["Any","Self"],Ae=["actor","any","associatedtype","async","await",/as\?/,/as!/,"as","borrowing","break","case","catch","class","consume","consuming","continue","convenience","copy","default","defer","deinit","didSet","distributed","do","dynamic","each","else","enum","extension","fallthrough",/fileprivate\(set\)/,"fileprivate","final","for","func","get","guard","if","import","indirect","infix",/init\?/,/init!/,"inout",/internal\(set\)/,"internal","in","is","isolated","nonisolated","lazy","let","macro","mutating","nonmutating",/open\(set\)/,"open","operator","optional","override","postfix","precedencegroup","prefix",/private\(set\)/,"private","protocol",/public\(set\)/,"public","repeat","required","rethrows","return","set","some","static","struct","subscript","super","switch","throws","throw",/try\?/,/try!/,"try","typealias",/unowned\(safe\)/,/unowned\(unsafe\)/,"unowned","var","weak","where","while","willSet"],Ce=["false","nil","true"],Te=["assignment","associativity","higherThan","left","lowerThan","none","right"],Re=["#colorLiteral","#column","#dsohandle","#else","#elseif","#endif","#error","#file","#fileID","#fileLiteral","#filePath","#function","#if","#imageLiteral","#keyPath","#line","#selector","#sourceLocation","#warning"],De=["abs","all","any","assert","assertionFailure","debugPrint","dump","fatalError","getVaList","isKnownUniquelyReferenced","max","min","numericCast","pointwiseMax","pointwiseMin","precondition","preconditionFailure","print","readLine","repeatElement","sequence","stride","swap","swift_unboxFromSwiftValueWithType","transcode","type","unsafeBitCast","unsafeDowncast","withExtendedLifetime","withUnsafeMutablePointer","withUnsafePointer","withVaList","withoutActuallyEscaping","zip"],Ie=m(/[/=\-+!*%<>&|^~?]/,/[\u00A1-\u00A7]/,/[\u00A9\u00AB]/,/[\u00AC\u00AE]/,/[\u00B0\u00B1]/,/[\u00B6\u00BB\u00BF\u00D7\u00F7]/,/[\u2016-\u2017]/,/[\u2020-\u2027]/,/[\u2030-\u203E]/,/[\u2041-\u2053]/,/[\u2055-\u205E]/,/[\u2190-\u23FF]/,/[\u2500-\u2775]/,/[\u2794-\u2BFF]/,/[\u2E00-\u2E7F]/,/[\u3001-\u3003]/,/[\u3008-\u3020]/,/[\u3030]/),Le=m(Ie,/[\u0300-\u036F]/,/[\u1DC0-\u1DFF]/,/[\u20D0-\u20FF]/,/[\uFE00-\uFE0F]/,/[\uFE20-\uFE2F]/),Be=b(Ie,Le,"*"),$e=m(/
[a-zA-Z_]/,/[\u00A8\u00AA\u00AD\u00AF\u00B2-\u00B5\u00B7-\u00BA]/,/[\u00BC-\u00BE\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF]/,/[\u0100-\u02FF\u0370-\u167F\u1681-\u180D\u180F-\u1DBF]/,/[\u1E00-\u1FFF]/,/[\u200B-\u200D\u202A-\u202E\u203F-\u2040\u2054\u2060-\u206F]/,/[\u2070-\u20CF\u2100-\u218F\u2460-\u24FF\u2776-\u2793]/,/[\u2C00-\u2DFF\u2E80-\u2FFF]/,/[\u3004-\u3007\u3021-\u302F\u3031-\u303F\u3040-\uD7FF]/,/[\uF900-\uFD3D\uFD40-\uFDCF\uFDF0-\uFE1F\uFE30-\uFE44]/,/[\uFE47-\uFEFE\uFF00-\uFFFD]/),ze=m($e,/\d/,/[\u0300-\u036F\u1DC0-\u1DFF\u20D0-\u20FF\uFE20-\uFE2F]/),Fe=b($e,ze,"*"),Ue=b(/[A-Z]/,ze,"*"),je=["attached","autoclosure",b(/convention\(/,m("swift","block","c"),/\)/),"discardableResult","dynamicCallable","dynamicMemberLookup","escaping","freestanding","frozen","GKInspectable","IBAction","IBDesignable","IBInspectable","IBOutlet","IBSegueAction","inlinable","main","nonobjc","NSApplicationMain","NSCopying","NSManaged",b(/objc\(/,Fe,/\)/),"objc","objcMembers","propertyWrapper","requires_stored_property_inits","resultBuilder","Sendable","testable","UIApplicationMain","unchecked","unknown","usableFromInline","warn_unqualified_access"],Pe=["iOS","iOSApplicationExtension","macOS","macOSApplicationExtension","macCatalyst","macCatalystApplicationExtension","watchOS","watchOSApplicationExtension","tvOS","tvOSApplicationExtension","swift"] + ;var Ke=Object.freeze({__proto__:null,grmr_bash:e=>{const n=e.regex,t={},a={ + begin:/\$\{/,end:/\}/,contains:["self",{begin:/:-/,contains:[t]}]} + ;Object.assign(t,{className:"variable",variants:[{ + begin:n.concat(/\$[\w\d#@][\w\d_]*/,"(?![\\w\\d])(?![$])")},a]});const i={ + className:"subst",begin:/\$\(/,end:/\)/,contains:[e.BACKSLASH_ESCAPE]},r={ + begin:/<<-?\s*(?=\w+)/,starts:{contains:[e.END_SAME_AS_BEGIN({begin:/(\w+)/, + end:/(\w+)/,className:"string"})]}},s={className:"string",begin:/"/,end:/"/, + contains:[e.BACKSLASH_ESCAPE,t,i]};i.contains.push(s);const o={begin:/\$?\(\(/, + end:/\)\)/,contains:[{begin:/\d+#[0-9a-f]+/,className:"number"},e.NUMBER_MODE,t] + },l=e.SHEBANG({binary:"(fish|bash|zsh|sh|csh|ksh|tcsh|dash|scsh)",relevance:10 + }),c={className:"function",begin:/\w[\w\d_]*\s*\(\s*\)\s*\{/,returnBegin:!0, + contains:[e.inherit(e.TITLE_MODE,{begin:/\w[\w\d_]*/})],relevance:0};return{ + name:"Bash",aliases:["sh"],keywords:{$pattern:/\b[a-z][a-z0-9._-]+\b/, + keyword:["if","then","else","elif","fi","for","while","until","in","do","done","case","esac","function","select"], + literal:["true","false"], + 
built_in:["break","cd","continue","eval","exec","exit","export","getopts","hash","pwd","readonly","return","shift","test","times","trap","umask","unset","alias","bind","builtin","caller","command","declare","echo","enable","help","let","local","logout","mapfile","printf","read","readarray","source","type","typeset","ulimit","unalias","set","shopt","autoload","bg","bindkey","bye","cap","chdir","clone","comparguments","compcall","compctl","compdescribe","compfiles","compgroups","compquote","comptags","comptry","compvalues","dirs","disable","disown","echotc","echoti","emulate","fc","fg","float","functions","getcap","getln","history","integer","jobs","kill","limit","log","noglob","popd","print","pushd","pushln","rehash","sched","setcap","setopt","stat","suspend","ttyctl","unfunction","unhash","unlimit","unsetopt","vared","wait","whence","where","which","zcompile","zformat","zftp","zle","zmodload","zparseopts","zprof","zpty","zregexparse","zsocket","zstyle","ztcp","chcon","chgrp","chown","chmod","cp","dd","df","dir","dircolors","ln","ls","mkdir","mkfifo","mknod","mktemp","mv","realpath","rm","rmdir","shred","sync","touch","truncate","vdir","b2sum","base32","base64","cat","cksum","comm","csplit","cut","expand","fmt","fold","head","join","md5sum","nl","numfmt","od","paste","ptx","pr","sha1sum","sha224sum","sha256sum","sha384sum","sha512sum","shuf","sort","split","sum","tac","tail","tr","tsort","unexpand","uniq","wc","arch","basename","chroot","date","dirname","du","echo","env","expr","factor","groups","hostid","id","link","logname","nice","nohup","nproc","pathchk","pinky","printenv","printf","pwd","readlink","runcon","seq","sleep","stat","stdbuf","stty","tee","test","timeout","tty","uname","unlink","uptime","users","who","whoami","yes"] + },contains:[l,e.SHEBANG(),c,o,e.HASH_COMMENT_MODE,r,{match:/(\/[a-z._-]+)+/},s,{ + match:/\\"/},{className:"string",begin:/'/,end:/'/},{match:/\\'/},t]}}, + grmr_c:e=>{const n=e.regex,t=e.COMMENT("//","$",{contains:[{begin:/\\\n/}] + }),a="decltype\\(auto\\)",i="[a-zA-Z_]\\w*::",r="("+a+"|"+n.optional(i)+"[a-zA-Z_]\\w*"+n.optional("<[^<>]+>")+")",s={ + className:"type",variants:[{begin:"\\b[a-z\\d_]*_t\\b"},{ + match:/\batomic_[a-z]{3,6}\b/}]},o={className:"string",variants:[{ + begin:'(u8?|U|L)?"',end:'"',illegal:"\\n",contains:[e.BACKSLASH_ESCAPE]},{ + begin:"(u8?|U|L)?'(\\\\(x[0-9A-Fa-f]{2}|u[0-9A-Fa-f]{4,8}|[0-7]{3}|\\S)|.)", + end:"'",illegal:"."},e.END_SAME_AS_BEGIN({ + begin:/(?:u8?|U|L)?R"([^()\\ ]{0,16})\(/,end:/\)([^()\\ ]{0,16})"/})]},l={ + className:"number",variants:[{begin:"\\b(0b[01']+)"},{ + begin:"(-?)\\b([\\d']+(\\.[\\d']*)?|\\.[\\d']+)((ll|LL|l|L)(u|U)?|(u|U)(ll|LL|l|L)?|f|F|b|B)" + },{ + begin:"(-?)(\\b0[xX][a-fA-F0-9']+|(\\b[\\d']+(\\.[\\d']*)?|\\.[\\d']+)([eE][-+]?[\\d']+)?)" + }],relevance:0},c={className:"meta",begin:/#\s*[a-z]+\b/,end:/$/,keywords:{ + keyword:"if else elif endif define undef warning error line pragma _Pragma ifdef ifndef include" + },contains:[{begin:/\\\n/,relevance:0},e.inherit(o,{className:"string"}),{ + className:"string",begin:/<.*?>/},t,e.C_BLOCK_COMMENT_MODE]},d={ + className:"title",begin:n.optional(i)+e.IDENT_RE,relevance:0 + },g=n.optional(i)+e.IDENT_RE+"\\s*\\(",u={ + 
keyword:["asm","auto","break","case","continue","default","do","else","enum","extern","for","fortran","goto","if","inline","register","restrict","return","sizeof","struct","switch","typedef","union","volatile","while","_Alignas","_Alignof","_Atomic","_Generic","_Noreturn","_Static_assert","_Thread_local","alignas","alignof","noreturn","static_assert","thread_local","_Pragma"], + type:["float","double","signed","unsigned","int","short","long","char","void","_Bool","_Complex","_Imaginary","_Decimal32","_Decimal64","_Decimal128","const","static","complex","bool","imaginary"], + literal:"true false NULL", + built_in:"std string wstring cin cout cerr clog stdin stdout stderr stringstream istringstream ostringstream auto_ptr deque list queue stack vector map set pair bitset multiset multimap unordered_set unordered_map unordered_multiset unordered_multimap priority_queue make_pair array shared_ptr abort terminate abs acos asin atan2 atan calloc ceil cosh cos exit exp fabs floor fmod fprintf fputs free frexp fscanf future isalnum isalpha iscntrl isdigit isgraph islower isprint ispunct isspace isupper isxdigit tolower toupper labs ldexp log10 log malloc realloc memchr memcmp memcpy memset modf pow printf putchar puts scanf sinh sin snprintf sprintf sqrt sscanf strcat strchr strcmp strcpy strcspn strlen strncat strncmp strncpy strpbrk strrchr strspn strstr tanh tan vfprintf vprintf vsprintf endl initializer_list unique_ptr" + },b=[c,s,t,e.C_BLOCK_COMMENT_MODE,l,o],m={variants:[{begin:/=/,end:/;/},{ + begin:/\(/,end:/\)/},{beginKeywords:"new throw return else",end:/;/}], + keywords:u,contains:b.concat([{begin:/\(/,end:/\)/,keywords:u, + contains:b.concat(["self"]),relevance:0}]),relevance:0},p={ + begin:"("+r+"[\\*&\\s]+)+"+g,returnBegin:!0,end:/[{;=]/,excludeEnd:!0, + keywords:u,illegal:/[^\w\s\*&:<>.]/,contains:[{begin:a,keywords:u,relevance:0},{ + begin:g,returnBegin:!0,contains:[e.inherit(d,{className:"title.function"})], + relevance:0},{relevance:0,match:/,/},{className:"params",begin:/\(/,end:/\)/, + keywords:u,relevance:0,contains:[t,e.C_BLOCK_COMMENT_MODE,o,l,s,{begin:/\(/, + end:/\)/,keywords:u,relevance:0,contains:["self",t,e.C_BLOCK_COMMENT_MODE,o,l,s] + }]},s,t,e.C_BLOCK_COMMENT_MODE,c]};return{name:"C",aliases:["h"],keywords:u, + disableAutodetect:!0,illegal:"=]/,contains:[{ + beginKeywords:"final class struct"},e.TITLE_MODE]}]),exports:{preprocessor:c, + strings:o,keywords:u}}},grmr_cpp:e=>{const n=e.regex,t=e.COMMENT("//","$",{ + contains:[{begin:/\\\n/}] + }),a="decltype\\(auto\\)",i="[a-zA-Z_]\\w*::",r="(?!struct)("+a+"|"+n.optional(i)+"[a-zA-Z_]\\w*"+n.optional("<[^<>]+>")+")",s={ + className:"type",begin:"\\b[a-z\\d_]*_t\\b"},o={className:"string",variants:[{ + begin:'(u8?|U|L)?"',end:'"',illegal:"\\n",contains:[e.BACKSLASH_ESCAPE]},{ + begin:"(u8?|U|L)?'(\\\\(x[0-9A-Fa-f]{2}|u[0-9A-Fa-f]{4,8}|[0-7]{3}|\\S)|.)", + end:"'",illegal:"."},e.END_SAME_AS_BEGIN({ + begin:/(?:u8?|U|L)?R"([^()\\ ]{0,16})\(/,end:/\)([^()\\ ]{0,16})"/})]},l={ + className:"number",variants:[{begin:"\\b(0b[01']+)"},{ + begin:"(-?)\\b([\\d']+(\\.[\\d']*)?|\\.[\\d']+)((ll|LL|l|L)(u|U)?|(u|U)(ll|LL|l|L)?|f|F|b|B)" + },{ + begin:"(-?)(\\b0[xX][a-fA-F0-9']+|(\\b[\\d']+(\\.[\\d']*)?|\\.[\\d']+)([eE][-+]?[\\d']+)?)" + }],relevance:0},c={className:"meta",begin:/#\s*[a-z]+\b/,end:/$/,keywords:{ + keyword:"if else elif endif define undef warning error line pragma _Pragma ifdef ifndef include" + },contains:[{begin:/\\\n/,relevance:0},e.inherit(o,{className:"string"}),{ + 
className:"string",begin:/<.*?>/},t,e.C_BLOCK_COMMENT_MODE]},d={ + className:"title",begin:n.optional(i)+e.IDENT_RE,relevance:0 + },g=n.optional(i)+e.IDENT_RE+"\\s*\\(",u={ + type:["bool","char","char16_t","char32_t","char8_t","double","float","int","long","short","void","wchar_t","unsigned","signed","const","static"], + keyword:["alignas","alignof","and","and_eq","asm","atomic_cancel","atomic_commit","atomic_noexcept","auto","bitand","bitor","break","case","catch","class","co_await","co_return","co_yield","compl","concept","const_cast|10","consteval","constexpr","constinit","continue","decltype","default","delete","do","dynamic_cast|10","else","enum","explicit","export","extern","false","final","for","friend","goto","if","import","inline","module","mutable","namespace","new","noexcept","not","not_eq","nullptr","operator","or","or_eq","override","private","protected","public","reflexpr","register","reinterpret_cast|10","requires","return","sizeof","static_assert","static_cast|10","struct","switch","synchronized","template","this","thread_local","throw","transaction_safe","transaction_safe_dynamic","true","try","typedef","typeid","typename","union","using","virtual","volatile","while","xor","xor_eq"], + literal:["NULL","false","nullopt","nullptr","true"],built_in:["_Pragma"], + _type_hints:["any","auto_ptr","barrier","binary_semaphore","bitset","complex","condition_variable","condition_variable_any","counting_semaphore","deque","false_type","future","imaginary","initializer_list","istringstream","jthread","latch","lock_guard","multimap","multiset","mutex","optional","ostringstream","packaged_task","pair","promise","priority_queue","queue","recursive_mutex","recursive_timed_mutex","scoped_lock","set","shared_future","shared_lock","shared_mutex","shared_timed_mutex","shared_ptr","stack","string_view","stringstream","timed_mutex","thread","true_type","tuple","unique_lock","unique_ptr","unordered_map","unordered_multimap","unordered_multiset","unordered_set","variant","vector","weak_ptr","wstring","wstring_view"] + },b={className:"function.dispatch",relevance:0,keywords:{ + _hint:["abort","abs","acos","apply","as_const","asin","atan","atan2","calloc","ceil","cerr","cin","clog","cos","cosh","cout","declval","endl","exchange","exit","exp","fabs","floor","fmod","forward","fprintf","fputs","free","frexp","fscanf","future","invoke","isalnum","isalpha","iscntrl","isdigit","isgraph","islower","isprint","ispunct","isspace","isupper","isxdigit","labs","launder","ldexp","log","log10","make_pair","make_shared","make_shared_for_overwrite","make_tuple","make_unique","malloc","memchr","memcmp","memcpy","memset","modf","move","pow","printf","putchar","puts","realloc","scanf","sin","sinh","snprintf","sprintf","sqrt","sscanf","std","stderr","stdin","stdout","strcat","strchr","strcmp","strcpy","strcspn","strlen","strncat","strncmp","strncpy","strpbrk","strrchr","strspn","strstr","swap","tan","tanh","terminate","to_underlying","tolower","toupper","vfprintf","visit","vprintf","vsprintf"] + }, + begin:n.concat(/\b/,/(?!decltype)/,/(?!if)/,/(?!for)/,/(?!switch)/,/(?!while)/,e.IDENT_RE,n.lookahead(/(<[^<>]+>|)\s*\(/)) + },m=[b,c,s,t,e.C_BLOCK_COMMENT_MODE,l,o],p={variants:[{begin:/=/,end:/;/},{ + begin:/\(/,end:/\)/},{beginKeywords:"new throw return else",end:/;/}], + keywords:u,contains:m.concat([{begin:/\(/,end:/\)/,keywords:u, + contains:m.concat(["self"]),relevance:0}]),relevance:0},_={className:"function", + begin:"("+r+"[\\*&\\s]+)+"+g,returnBegin:!0,end:/[{;=]/,excludeEnd:!0, + 
keywords:u,illegal:/[^\w\s\*&:<>.]/,contains:[{begin:a,keywords:u,relevance:0},{ + begin:g,returnBegin:!0,contains:[d],relevance:0},{begin:/::/,relevance:0},{ + begin:/:/,endsWithParent:!0,contains:[o,l]},{relevance:0,match:/,/},{ + className:"params",begin:/\(/,end:/\)/,keywords:u,relevance:0, + contains:[t,e.C_BLOCK_COMMENT_MODE,o,l,s,{begin:/\(/,end:/\)/,keywords:u, + relevance:0,contains:["self",t,e.C_BLOCK_COMMENT_MODE,o,l,s]}] + },s,t,e.C_BLOCK_COMMENT_MODE,c]};return{name:"C++", + aliases:["cc","c++","h++","hpp","hh","hxx","cxx"],keywords:u,illegal:"",keywords:u,contains:["self",s]},{begin:e.IDENT_RE+"::",keywords:u},{ + match:[/\b(?:enum(?:\s+(?:class|struct))?|class|struct|union)/,/\s+/,/\w+/], + className:{1:"keyword",3:"title.class"}}])}},grmr_csharp:e=>{const n={ + keyword:["abstract","as","base","break","case","catch","class","const","continue","do","else","event","explicit","extern","finally","fixed","for","foreach","goto","if","implicit","in","interface","internal","is","lock","namespace","new","operator","out","override","params","private","protected","public","readonly","record","ref","return","scoped","sealed","sizeof","stackalloc","static","struct","switch","this","throw","try","typeof","unchecked","unsafe","using","virtual","void","volatile","while"].concat(["add","alias","and","ascending","async","await","by","descending","equals","from","get","global","group","init","into","join","let","nameof","not","notnull","on","or","orderby","partial","remove","select","set","unmanaged","value|0","var","when","where","with","yield"]), + built_in:["bool","byte","char","decimal","delegate","double","dynamic","enum","float","int","long","nint","nuint","object","sbyte","short","string","ulong","uint","ushort"], + literal:["default","false","null","true"]},t=e.inherit(e.TITLE_MODE,{ + begin:"[a-zA-Z](\\.?\\w)*"}),a={className:"number",variants:[{ + begin:"\\b(0b[01']+)"},{ + begin:"(-?)\\b([\\d']+(\\.[\\d']*)?|\\.[\\d']+)(u|U|l|L|ul|UL|f|F|b|B)"},{ + begin:"(-?)(\\b0[xX][a-fA-F0-9']+|(\\b[\\d']+(\\.[\\d']*)?|\\.[\\d']+)([eE][-+]?[\\d']+)?)" + }],relevance:0},i={className:"string",begin:'@"',end:'"',contains:[{begin:'""'}] + },r=e.inherit(i,{illegal:/\n/}),s={className:"subst",begin:/\{/,end:/\}/, + keywords:n},o=e.inherit(s,{illegal:/\n/}),l={className:"string",begin:/\$"/, + end:'"',illegal:/\n/,contains:[{begin:/\{\{/},{begin:/\}\}/ + },e.BACKSLASH_ESCAPE,o]},c={className:"string",begin:/\$@"/,end:'"',contains:[{ + begin:/\{\{/},{begin:/\}\}/},{begin:'""'},s]},d=e.inherit(c,{illegal:/\n/, + contains:[{begin:/\{\{/},{begin:/\}\}/},{begin:'""'},o]}) + ;s.contains=[c,l,i,e.APOS_STRING_MODE,e.QUOTE_STRING_MODE,a,e.C_BLOCK_COMMENT_MODE], + o.contains=[d,l,r,e.APOS_STRING_MODE,e.QUOTE_STRING_MODE,a,e.inherit(e.C_BLOCK_COMMENT_MODE,{ + illegal:/\n/})];const g={variants:[c,l,i,e.APOS_STRING_MODE,e.QUOTE_STRING_MODE] + },u={begin:"<",end:">",contains:[{beginKeywords:"in out"},t] + },b=e.IDENT_RE+"(<"+e.IDENT_RE+"(\\s*,\\s*"+e.IDENT_RE+")*>)?(\\[\\])?",m={ + begin:"@"+e.IDENT_RE,relevance:0};return{name:"C#",aliases:["cs","c#"], + keywords:n,illegal:/::/,contains:[e.COMMENT("///","$",{returnBegin:!0, + contains:[{className:"doctag",variants:[{begin:"///",relevance:0},{ + begin:"\x3c!--|--\x3e"},{begin:""}]}] + }),e.C_LINE_COMMENT_MODE,e.C_BLOCK_COMMENT_MODE,{className:"meta",begin:"#", + end:"$",keywords:{ + keyword:"if else elif endif define undef warning error line region endregion pragma checksum" + }},g,a,{beginKeywords:"class interface",relevance:0,end:/[{;=]/, + 
illegal:/[^\s:,]/,contains:[{beginKeywords:"where class" + },t,u,e.C_LINE_COMMENT_MODE,e.C_BLOCK_COMMENT_MODE]},{beginKeywords:"namespace", + relevance:0,end:/[{;=]/,illegal:/[^\s:]/, + contains:[t,e.C_LINE_COMMENT_MODE,e.C_BLOCK_COMMENT_MODE]},{ + beginKeywords:"record",relevance:0,end:/[{;=]/,illegal:/[^\s:]/, + contains:[t,u,e.C_LINE_COMMENT_MODE,e.C_BLOCK_COMMENT_MODE]},{className:"meta", + begin:"^\\s*\\[(?=[\\w])",excludeBegin:!0,end:"\\]",excludeEnd:!0,contains:[{ + className:"string",begin:/"/,end:/"/}]},{ + beginKeywords:"new return throw await else",relevance:0},{className:"function", + begin:"("+b+"\\s+)+"+e.IDENT_RE+"\\s*(<[^=]+>\\s*)?\\(",returnBegin:!0, + end:/\s*[{;=]/,excludeEnd:!0,keywords:n,contains:[{ + beginKeywords:"public private protected static internal protected abstract async extern override unsafe virtual new sealed partial", + relevance:0},{begin:e.IDENT_RE+"\\s*(<[^=]+>\\s*)?\\(",returnBegin:!0, + contains:[e.TITLE_MODE,u],relevance:0},{match:/\(\)/},{className:"params", + begin:/\(/,end:/\)/,excludeBegin:!0,excludeEnd:!0,keywords:n,relevance:0, + contains:[g,a,e.C_BLOCK_COMMENT_MODE] + },e.C_LINE_COMMENT_MODE,e.C_BLOCK_COMMENT_MODE]},m]}},grmr_css:e=>{ + const n=e.regex,t=ie(e),a=[e.APOS_STRING_MODE,e.QUOTE_STRING_MODE];return{ + name:"CSS",case_insensitive:!0,illegal:/[=|'\$]/,keywords:{ + keyframePosition:"from to"},classNameAliases:{keyframePosition:"selector-tag"}, + contains:[t.BLOCK_COMMENT,{begin:/-(webkit|moz|ms|o)-(?=[a-z])/ + },t.CSS_NUMBER_MODE,{className:"selector-id",begin:/#[A-Za-z0-9_-]+/,relevance:0 + },{className:"selector-class",begin:"\\.[a-zA-Z-][a-zA-Z0-9_-]*",relevance:0 + },t.ATTRIBUTE_SELECTOR_MODE,{className:"selector-pseudo",variants:[{ + begin:":("+oe.join("|")+")"},{begin:":(:)?("+le.join("|")+")"}] + },t.CSS_VARIABLE,{className:"attribute",begin:"\\b("+ce.join("|")+")\\b"},{ + begin:/:/,end:/[;}{]/, + contains:[t.BLOCK_COMMENT,t.HEXCOLOR,t.IMPORTANT,t.CSS_NUMBER_MODE,...a,{ + begin:/(url|data-uri)\(/,end:/\)/,relevance:0,keywords:{built_in:"url data-uri" + },contains:[...a,{className:"string",begin:/[^)]/,endsWithParent:!0, + excludeEnd:!0}]},t.FUNCTION_DISPATCH]},{begin:n.lookahead(/@/),end:"[{;]", + relevance:0,illegal:/:/,contains:[{className:"keyword",begin:/@-?\w[\w]*(-\w+)*/ + },{begin:/\s/,endsWithParent:!0,excludeEnd:!0,relevance:0,keywords:{ + $pattern:/[a-z-]+/,keyword:"and or not only",attribute:se.join(" ")},contains:[{ + begin:/[a-z-]+(?=:)/,className:"attribute"},...a,t.CSS_NUMBER_MODE]}]},{ + className:"selector-tag",begin:"\\b("+re.join("|")+")\\b"}]}},grmr_diff:e=>{ + const n=e.regex;return{name:"Diff",aliases:["patch"],contains:[{ + className:"meta",relevance:10, + match:n.either(/^@@ +-\d+,\d+ +\+\d+,\d+ +@@/,/^\*\*\* +\d+,\d+ +\*\*\*\*$/,/^--- +\d+,\d+ +----$/) + },{className:"comment",variants:[{ + begin:n.either(/Index: /,/^index/,/={3,}/,/^-{3}/,/^\*{3} /,/^\+{3}/,/^diff --git/), + end:/$/},{match:/^\*{15}$/}]},{className:"addition",begin:/^\+/,end:/$/},{ + className:"deletion",begin:/^-/,end:/$/},{className:"addition",begin:/^!/, + end:/$/}]}},grmr_go:e=>{const n={ + keyword:["break","case","chan","const","continue","default","defer","else","fallthrough","for","func","go","goto","if","import","interface","map","package","range","return","select","struct","switch","type","var"], + type:["bool","byte","complex64","complex128","error","float32","float64","int8","int16","int32","int64","string","uint8","uint16","uint32","uint64","int","uint","uintptr","rune"], + literal:["true","false","iota","nil"], + 
built_in:["append","cap","close","complex","copy","imag","len","make","new","panic","print","println","real","recover","delete"] + };return{name:"Go",aliases:["golang"],keywords:n,illegal:"{const n=e.regex;return{name:"GraphQL",aliases:["gql"], + case_insensitive:!0,disableAutodetect:!1,keywords:{ + keyword:["query","mutation","subscription","type","input","schema","directive","interface","union","scalar","fragment","enum","on"], + literal:["true","false","null"]}, + contains:[e.HASH_COMMENT_MODE,e.QUOTE_STRING_MODE,e.NUMBER_MODE,{ + scope:"punctuation",match:/[.]{3}/,relevance:0},{scope:"punctuation", + begin:/[\!\(\)\:\=\[\]\{\|\}]{1}/,relevance:0},{scope:"variable",begin:/\$/, + end:/\W/,excludeEnd:!0,relevance:0},{scope:"meta",match:/@\w+/,excludeEnd:!0},{ + scope:"symbol",begin:n.concat(/[_A-Za-z][_0-9A-Za-z]*/,n.lookahead(/\s*:/)), + relevance:0}],illegal:[/[;<']/,/BEGIN/]}},grmr_ini:e=>{const n=e.regex,t={ + className:"number",relevance:0,variants:[{begin:/([+-]+)?[\d]+_[\d_]+/},{ + begin:e.NUMBER_RE}]},a=e.COMMENT();a.variants=[{begin:/;/,end:/$/},{begin:/#/, + end:/$/}];const i={className:"variable",variants:[{begin:/\$[\w\d"][\w\d_]*/},{ + begin:/\$\{(.*?)\}/}]},r={className:"literal", + begin:/\bon|off|true|false|yes|no\b/},s={className:"string", + contains:[e.BACKSLASH_ESCAPE],variants:[{begin:"'''",end:"'''",relevance:10},{ + begin:'"""',end:'"""',relevance:10},{begin:'"',end:'"'},{begin:"'",end:"'"}] + },o={begin:/\[/,end:/\]/,contains:[a,r,i,s,t,"self"],relevance:0 + },l=n.either(/[A-Za-z0-9_-]+/,/"(\\"|[^"])*"/,/'[^']*'/);return{ + name:"TOML, also INI",aliases:["toml"],case_insensitive:!0,illegal:/\S/, + contains:[a,{className:"section",begin:/\[+/,end:/\]+/},{ + begin:n.concat(l,"(\\s*\\.\\s*",l,")*",n.lookahead(/\s*=\s*[^#\s]/)), + className:"attr",starts:{end:/$/,contains:[a,o,r,i,s,t]}}]}},grmr_java:e=>{ + const n=e.regex,t="[\xc0-\u02b8a-zA-Z_$][\xc0-\u02b8a-zA-Z_$0-9]*",a=t+pe("(?:<"+t+"~~~(?:\\s*,\\s*"+t+"~~~)*>)?",/~~~/g,2),i={ + keyword:["synchronized","abstract","private","var","static","if","const ","for","while","strictfp","finally","protected","import","native","final","void","enum","else","break","transient","catch","instanceof","volatile","case","assert","package","default","public","try","switch","continue","throws","protected","public","private","module","requires","exports","do","sealed","yield","permits"], + literal:["false","true","null"], + type:["char","boolean","long","float","int","byte","short","double"], + built_in:["super","this"]},r={className:"meta",begin:"@"+t,contains:[{ + begin:/\(/,end:/\)/,contains:["self"]}]},s={className:"params",begin:/\(/, + end:/\)/,keywords:i,relevance:0,contains:[e.C_BLOCK_COMMENT_MODE],endsParent:!0} + ;return{name:"Java",aliases:["jsp"],keywords:i,illegal:/<\/|#/, + contains:[e.COMMENT("/\\*\\*","\\*/",{relevance:0,contains:[{begin:/\w+@/, + relevance:0},{className:"doctag",begin:"@[A-Za-z]+"}]}),{ + begin:/import java\.[a-z]+\./,keywords:"import",relevance:2 + },e.C_LINE_COMMENT_MODE,e.C_BLOCK_COMMENT_MODE,{begin:/"""/,end:/"""/, + className:"string",contains:[e.BACKSLASH_ESCAPE] + },e.APOS_STRING_MODE,e.QUOTE_STRING_MODE,{ + match:[/\b(?:class|interface|enum|extends|implements|new)/,/\s+/,t],className:{ + 1:"keyword",3:"title.class"}},{match:/non-sealed/,scope:"keyword"},{ + begin:[n.concat(/(?!else)/,t),/\s+/,t,/\s+/,/=(?!=)/],className:{1:"type", + 3:"variable",5:"operator"}},{begin:[/record/,/\s+/,t],className:{1:"keyword", + 3:"title.class"},contains:[s,e.C_LINE_COMMENT_MODE,e.C_BLOCK_COMMENT_MODE]},{ + 
beginKeywords:"new throw return else",relevance:0},{ + begin:["(?:"+a+"\\s+)",e.UNDERSCORE_IDENT_RE,/\s*(?=\()/],className:{ + 2:"title.function"},keywords:i,contains:[{className:"params",begin:/\(/, + end:/\)/,keywords:i,relevance:0, + contains:[r,e.APOS_STRING_MODE,e.QUOTE_STRING_MODE,me,e.C_BLOCK_COMMENT_MODE] + },e.C_LINE_COMMENT_MODE,e.C_BLOCK_COMMENT_MODE]},me,r]}},grmr_javascript:Oe, + grmr_json:e=>{const n=["true","false","null"],t={scope:"literal", + beginKeywords:n.join(" ")};return{name:"JSON",keywords:{literal:n},contains:[{ + className:"attr",begin:/"(\\.|[^\\"\r\n])*"(?=\s*:)/,relevance:1.01},{ + match:/[{}[\],:]/,className:"punctuation",relevance:0 + },e.QUOTE_STRING_MODE,t,e.C_NUMBER_MODE,e.C_LINE_COMMENT_MODE,e.C_BLOCK_COMMENT_MODE], + illegal:"\\S"}},grmr_kotlin:e=>{const n={ + keyword:"abstract as val var vararg get set class object open private protected public noinline crossinline dynamic final enum if else do while for when throw try catch finally import package is in fun override companion reified inline lateinit init interface annotation data sealed internal infix operator out by constructor super tailrec where const inner suspend typealias external expect actual", + built_in:"Byte Short Char Int Long Boolean Float Double Void Unit Nothing", + literal:"true false null"},t={className:"symbol",begin:e.UNDERSCORE_IDENT_RE+"@" + },a={className:"subst",begin:/\$\{/,end:/\}/,contains:[e.C_NUMBER_MODE]},i={ + className:"variable",begin:"\\$"+e.UNDERSCORE_IDENT_RE},r={className:"string", + variants:[{begin:'"""',end:'"""(?=[^"])',contains:[i,a]},{begin:"'",end:"'", + illegal:/\n/,contains:[e.BACKSLASH_ESCAPE]},{begin:'"',end:'"',illegal:/\n/, + contains:[e.BACKSLASH_ESCAPE,i,a]}]};a.contains.push(r);const s={ + className:"meta", + begin:"@(?:file|property|field|get|set|receiver|param|setparam|delegate)\\s*:(?:\\s*"+e.UNDERSCORE_IDENT_RE+")?" 
+ },o={className:"meta",begin:"@"+e.UNDERSCORE_IDENT_RE,contains:[{begin:/\(/, + end:/\)/,contains:[e.inherit(r,{className:"string"}),"self"]}] + },l=me,c=e.COMMENT("/\\*","\\*/",{contains:[e.C_BLOCK_COMMENT_MODE]}),d={ + variants:[{className:"type",begin:e.UNDERSCORE_IDENT_RE},{begin:/\(/,end:/\)/, + contains:[]}]},g=d;return g.variants[1].contains=[d],d.variants[1].contains=[g], + {name:"Kotlin",aliases:["kt","kts"],keywords:n, + contains:[e.COMMENT("/\\*\\*","\\*/",{relevance:0,contains:[{className:"doctag", + begin:"@[A-Za-z]+"}]}),e.C_LINE_COMMENT_MODE,c,{className:"keyword", + begin:/\b(break|continue|return|this)\b/,starts:{contains:[{className:"symbol", + begin:/@\w+/}]}},t,s,o,{className:"function",beginKeywords:"fun",end:"[(]|$", + returnBegin:!0,excludeEnd:!0,keywords:n,relevance:5,contains:[{ + begin:e.UNDERSCORE_IDENT_RE+"\\s*\\(",returnBegin:!0,relevance:0, + contains:[e.UNDERSCORE_TITLE_MODE]},{className:"type",begin://, + keywords:"reified",relevance:0},{className:"params",begin:/\(/,end:/\)/, + endsParent:!0,keywords:n,relevance:0,contains:[{begin:/:/,end:/[=,\/]/, + endsWithParent:!0,contains:[d,e.C_LINE_COMMENT_MODE,c],relevance:0 + },e.C_LINE_COMMENT_MODE,c,s,o,r,e.C_NUMBER_MODE]},c]},{ + begin:[/class|interface|trait/,/\s+/,e.UNDERSCORE_IDENT_RE],beginScope:{ + 3:"title.class"},keywords:"class interface trait",end:/[:\{(]|$/,excludeEnd:!0, + illegal:"extends implements",contains:[{ + beginKeywords:"public protected internal private constructor" + },e.UNDERSCORE_TITLE_MODE,{className:"type",begin://,excludeBegin:!0, + excludeEnd:!0,relevance:0},{className:"type",begin:/[,:]\s*/,end:/[<\(,){\s]|$/, + excludeBegin:!0,returnEnd:!0},s,o]},r,{className:"meta",begin:"^#!/usr/bin/env", + end:"$",illegal:"\n"},l]}},grmr_less:e=>{ + const n=ie(e),t=de,a="[\\w-]+",i="("+a+"|@\\{"+a+"\\})",r=[],s=[],o=e=>({ + className:"string",begin:"~?"+e+".*?"+e}),l=(e,n,t)=>({className:e,begin:n, + relevance:t}),c={$pattern:/[a-z-]+/,keyword:"and or not only", + attribute:se.join(" ")},d={begin:"\\(",end:"\\)",contains:s,keywords:c, + relevance:0} + ;s.push(e.C_LINE_COMMENT_MODE,e.C_BLOCK_COMMENT_MODE,o("'"),o('"'),n.CSS_NUMBER_MODE,{ + begin:"(url|data-uri)\\(",starts:{className:"string",end:"[\\)\\n]", + excludeEnd:!0} + },n.HEXCOLOR,d,l("variable","@@?"+a,10),l("variable","@\\{"+a+"\\}"),l("built_in","~?`[^`]*?`"),{ + className:"attribute",begin:a+"\\s*:",end:":",returnBegin:!0,excludeEnd:!0 + },n.IMPORTANT,{beginKeywords:"and not"},n.FUNCTION_DISPATCH);const g=s.concat({ + begin:/\{/,end:/\}/,contains:r}),u={beginKeywords:"when",endsWithParent:!0, + contains:[{beginKeywords:"and not"}].concat(s)},b={begin:i+"\\s*:", + returnBegin:!0,end:/[;}]/,relevance:0,contains:[{begin:/-(webkit|moz|ms|o)-/ + },n.CSS_VARIABLE,{className:"attribute",begin:"\\b("+ce.join("|")+")\\b", + end:/(?=:)/,starts:{endsWithParent:!0,illegal:"[<=$]",relevance:0,contains:s}}] + },m={className:"keyword", + begin:"@(import|media|charset|font-face|(-[a-z]+-)?keyframes|supports|document|namespace|page|viewport|host)\\b", + starts:{end:"[;{}]",keywords:c,returnEnd:!0,contains:s,relevance:0}},p={ + className:"variable",variants:[{begin:"@"+a+"\\s*:",relevance:15},{begin:"@"+a + }],starts:{end:"[;}]",returnEnd:!0,contains:g}},_={variants:[{ + begin:"[\\.#:&\\[>]",end:"[;{}]"},{begin:i,end:/\{/}],returnBegin:!0, + returnEnd:!0,illegal:"[<='$\"]",relevance:0, + contains:[e.C_LINE_COMMENT_MODE,e.C_BLOCK_COMMENT_MODE,u,l("keyword","all\\b"),l("variable","@\\{"+a+"\\}"),{ + 
begin:"\\b("+re.join("|")+")\\b",className:"selector-tag" + },n.CSS_NUMBER_MODE,l("selector-tag",i,0),l("selector-id","#"+i),l("selector-class","\\."+i,0),l("selector-tag","&",0),n.ATTRIBUTE_SELECTOR_MODE,{ + className:"selector-pseudo",begin:":("+oe.join("|")+")"},{ + className:"selector-pseudo",begin:":(:)?("+le.join("|")+")"},{begin:/\(/, + end:/\)/,relevance:0,contains:g},{begin:"!important"},n.FUNCTION_DISPATCH]},h={ + begin:a+":(:)?"+`(${t.join("|")})`,returnBegin:!0,contains:[_]} + ;return r.push(e.C_LINE_COMMENT_MODE,e.C_BLOCK_COMMENT_MODE,m,p,h,b,_,u,n.FUNCTION_DISPATCH), + {name:"Less",case_insensitive:!0,illegal:"[=>'/<($\"]",contains:r}}, + grmr_lua:e=>{const n="\\[=*\\[",t="\\]=*\\]",a={begin:n,end:t,contains:["self"] + },i=[e.COMMENT("--(?!"+n+")","$"),e.COMMENT("--"+n,t,{contains:[a],relevance:10 + })];return{name:"Lua",keywords:{$pattern:e.UNDERSCORE_IDENT_RE, + literal:"true false nil", + keyword:"and break do else elseif end for goto if in local not or repeat return then until while", + built_in:"_G _ENV _VERSION __index __newindex __mode __call __metatable __tostring __len __gc __add __sub __mul __div __mod __pow __concat __unm __eq __lt __le assert collectgarbage dofile error getfenv getmetatable ipairs load loadfile loadstring module next pairs pcall print rawequal rawget rawset require select setfenv setmetatable tonumber tostring type unpack xpcall arg self coroutine resume yield status wrap create running debug getupvalue debug sethook getmetatable gethook setmetatable setlocal traceback setfenv getinfo setupvalue getlocal getregistry getfenv io lines write close flush open output type read stderr stdin input stdout popen tmpfile math log max acos huge ldexp pi cos tanh pow deg tan cosh sinh random randomseed frexp ceil floor rad abs sqrt modf asin min mod fmod log10 atan2 exp sin atan os exit setlocale date getenv difftime remove time clock tmpname rename execute package preload loadlib loaded loaders cpath config path seeall string sub upper len gfind rep find match char dump gmatch reverse byte format gsub lower table setn insert getn foreachi maxn foreach concat sort remove" + },contains:i.concat([{className:"function",beginKeywords:"function",end:"\\)", + contains:[e.inherit(e.TITLE_MODE,{ + begin:"([_a-zA-Z]\\w*\\.)*([_a-zA-Z]\\w*:)?[_a-zA-Z]\\w*"}),{className:"params", + begin:"\\(",endsWithParent:!0,contains:i}].concat(i) + },e.C_NUMBER_MODE,e.APOS_STRING_MODE,e.QUOTE_STRING_MODE,{className:"string", + begin:n,end:t,contains:[a],relevance:5}])}},grmr_makefile:e=>{const n={ + className:"variable",variants:[{begin:"\\$\\("+e.UNDERSCORE_IDENT_RE+"\\)", + contains:[e.BACKSLASH_ESCAPE]},{begin:/\$[@%{ + const n={begin:/<\/?[A-Za-z_]/,end:">",subLanguage:"xml",relevance:0},t={ + variants:[{begin:/\[.+?\]\[.*?\]/,relevance:0},{ + begin:/\[.+?\]\(((data|javascript|mailto):|(?:http|ftp)s?:\/\/).*?\)/, + relevance:2},{ + begin:e.regex.concat(/\[.+?\]\(/,/[A-Za-z][A-Za-z0-9+.-]*/,/:\/\/.*?\)/), + relevance:2},{begin:/\[.+?\]\([./?&#].*?\)/,relevance:1},{ + begin:/\[.*?\]\(.*?\)/,relevance:0}],returnBegin:!0,contains:[{match:/\[(?=\])/ + },{className:"string",relevance:0,begin:"\\[",end:"\\]",excludeBegin:!0, + returnEnd:!0},{className:"link",relevance:0,begin:"\\]\\(",end:"\\)", + excludeBegin:!0,excludeEnd:!0},{className:"symbol",relevance:0,begin:"\\]\\[", + end:"\\]",excludeBegin:!0,excludeEnd:!0}]},a={className:"strong",contains:[], + variants:[{begin:/_{2}(?!\s)/,end:/_{2}/},{begin:/\*{2}(?!\s)/,end:/\*{2}/}] + 
},i={className:"emphasis",contains:[],variants:[{begin:/\*(?![*\s])/,end:/\*/},{ + begin:/_(?![_\s])/,end:/_/,relevance:0}]},r=e.inherit(a,{contains:[] + }),s=e.inherit(i,{contains:[]});a.contains.push(s),i.contains.push(r) + ;let o=[n,t];return[a,i,r,s].forEach((e=>{e.contains=e.contains.concat(o) + })),o=o.concat(a,i),{name:"Markdown",aliases:["md","mkdown","mkd"],contains:[{ + className:"section",variants:[{begin:"^#{1,6}",end:"$",contains:o},{ + begin:"(?=^.+?\\n[=-]{2,}$)",contains:[{begin:"^[=-]*$"},{begin:"^",end:"\\n", + contains:o}]}]},n,{className:"bullet",begin:"^[ \t]*([*+-]|(\\d+\\.))(?=\\s+)", + end:"\\s+",excludeEnd:!0},a,i,{className:"quote",begin:"^>\\s+",contains:o, + end:"$"},{className:"code",variants:[{begin:"(`{3,})[^`](.|\\n)*?\\1`*[ ]*"},{ + begin:"(~{3,})[^~](.|\\n)*?\\1~*[ ]*"},{begin:"```",end:"```+[ ]*$"},{ + begin:"~~~",end:"~~~+[ ]*$"},{begin:"`.+?`"},{begin:"(?=^( {4}|\\t))", + contains:[{begin:"^( {4}|\\t)",end:"(\\n)$"}],relevance:0}]},{ + begin:"^[-\\*]{3,}",end:"$"},t,{begin:/^\[[^\n]+\]:/,returnBegin:!0,contains:[{ + className:"symbol",begin:/\[/,end:/\]/,excludeBegin:!0,excludeEnd:!0},{ + className:"link",begin:/:\s*/,end:/$/,excludeBegin:!0}]}]}},grmr_objectivec:e=>{ + const n=/[a-zA-Z@][a-zA-Z0-9_]*/,t={$pattern:n, + keyword:["@interface","@class","@protocol","@implementation"]};return{ + name:"Objective-C",aliases:["mm","objc","obj-c","obj-c++","objective-c++"], + keywords:{"variable.language":["this","super"],$pattern:n, + keyword:["while","export","sizeof","typedef","const","struct","for","union","volatile","static","mutable","if","do","return","goto","enum","else","break","extern","asm","case","default","register","explicit","typename","switch","continue","inline","readonly","assign","readwrite","self","@synchronized","id","typeof","nonatomic","IBOutlet","IBAction","strong","weak","copy","in","out","inout","bycopy","byref","oneway","__strong","__weak","__block","__autoreleasing","@private","@protected","@public","@try","@property","@end","@throw","@catch","@finally","@autoreleasepool","@synthesize","@dynamic","@selector","@optional","@required","@encode","@package","@import","@defs","@compatibility_alias","__bridge","__bridge_transfer","__bridge_retained","__bridge_retain","__covariant","__contravariant","__kindof","_Nonnull","_Nullable","_Null_unspecified","__FUNCTION__","__PRETTY_FUNCTION__","__attribute__","getter","setter","retain","unsafe_unretained","nonnull","nullable","null_unspecified","null_resettable","class","instancetype","NS_DESIGNATED_INITIALIZER","NS_UNAVAILABLE","NS_REQUIRES_SUPER","NS_RETURNS_INNER_POINTER","NS_INLINE","NS_AVAILABLE","NS_DEPRECATED","NS_ENUM","NS_OPTIONS","NS_SWIFT_UNAVAILABLE","NS_ASSUME_NONNULL_BEGIN","NS_ASSUME_NONNULL_END","NS_REFINED_FOR_SWIFT","NS_SWIFT_NAME","NS_SWIFT_NOTHROW","NS_DURING","NS_HANDLER","NS_ENDHANDLER","NS_VALUERETURN","NS_VOIDRETURN"], + literal:["false","true","FALSE","TRUE","nil","YES","NO","NULL"], + built_in:["dispatch_once_t","dispatch_queue_t","dispatch_sync","dispatch_async","dispatch_once"], + type:["int","float","char","unsigned","signed","short","long","double","wchar_t","unichar","void","bool","BOOL","id|0","_Bool"] + },illegal:"/,end:/$/,illegal:"\\n" + },e.C_LINE_COMMENT_MODE,e.C_BLOCK_COMMENT_MODE]},{className:"class", + begin:"("+t.keyword.join("|")+")\\b",end:/(\{|$)/,excludeEnd:!0,keywords:t, + contains:[e.UNDERSCORE_TITLE_MODE]},{begin:"\\."+e.UNDERSCORE_IDENT_RE, + relevance:0}]}},grmr_perl:e=>{const n=e.regex,t=/[dualxmsipngr]{0,12}/,a={ + $pattern:/[\w.]+/, + 
keyword:"abs accept alarm and atan2 bind binmode bless break caller chdir chmod chomp chop chown chr chroot close closedir connect continue cos crypt dbmclose dbmopen defined delete die do dump each else elsif endgrent endhostent endnetent endprotoent endpwent endservent eof eval exec exists exit exp fcntl fileno flock for foreach fork format formline getc getgrent getgrgid getgrnam gethostbyaddr gethostbyname gethostent getlogin getnetbyaddr getnetbyname getnetent getpeername getpgrp getpriority getprotobyname getprotobynumber getprotoent getpwent getpwnam getpwuid getservbyname getservbyport getservent getsockname getsockopt given glob gmtime goto grep gt hex if index int ioctl join keys kill last lc lcfirst length link listen local localtime log lstat lt ma map mkdir msgctl msgget msgrcv msgsnd my ne next no not oct open opendir or ord our pack package pipe pop pos print printf prototype push q|0 qq quotemeta qw qx rand read readdir readline readlink readpipe recv redo ref rename require reset return reverse rewinddir rindex rmdir say scalar seek seekdir select semctl semget semop send setgrent sethostent setnetent setpgrp setpriority setprotoent setpwent setservent setsockopt shift shmctl shmget shmread shmwrite shutdown sin sleep socket socketpair sort splice split sprintf sqrt srand stat state study sub substr symlink syscall sysopen sysread sysseek system syswrite tell telldir tie tied time times tr truncate uc ucfirst umask undef unless unlink unpack unshift untie until use utime values vec wait waitpid wantarray warn when while write x|0 xor y|0" + },i={className:"subst",begin:"[$@]\\{",end:"\\}",keywords:a},r={begin:/->\{/, + end:/\}/},s={variants:[{begin:/\$\d/},{ + begin:n.concat(/[$%@](\^\w\b|#\w+(::\w+)*|\{\w+\}|\w+(::\w*)*)/,"(?![A-Za-z])(?![@$%])") + },{begin:/[$%@][^\s\w{]/,relevance:0}] + },o=[e.BACKSLASH_ESCAPE,i,s],l=[/!/,/\//,/\|/,/\?/,/'/,/"/,/#/],c=(e,a,i="\\1")=>{ + const r="\\1"===i?i:n.concat(i,a) + ;return n.concat(n.concat("(?:",e,")"),a,/(?:\\.|[^\\\/])*?/,r,/(?:\\.|[^\\\/])*?/,i,t) + },d=(e,a,i)=>n.concat(n.concat("(?:",e,")"),a,/(?:\\.|[^\\\/])*?/,i,t),g=[s,e.HASH_COMMENT_MODE,e.COMMENT(/^=\w/,/=cut/,{ + endsWithParent:!0}),r,{className:"string",contains:o,variants:[{ + begin:"q[qwxr]?\\s*\\(",end:"\\)",relevance:5},{begin:"q[qwxr]?\\s*\\[", + end:"\\]",relevance:5},{begin:"q[qwxr]?\\s*\\{",end:"\\}",relevance:5},{ + begin:"q[qwxr]?\\s*\\|",end:"\\|",relevance:5},{begin:"q[qwxr]?\\s*<",end:">", + relevance:5},{begin:"qw\\s+q",end:"q",relevance:5},{begin:"'",end:"'", + contains:[e.BACKSLASH_ESCAPE]},{begin:'"',end:'"'},{begin:"`",end:"`", + contains:[e.BACKSLASH_ESCAPE]},{begin:/\{\w+\}/,relevance:0},{ + begin:"-?\\w+\\s*=>",relevance:0}]},{className:"number", + begin:"(\\b0[0-7_]+)|(\\b0x[0-9a-fA-F_]+)|(\\b[1-9][0-9_]*(\\.[0-9_]+)?)|[0_]\\b", + relevance:0},{ + begin:"(\\/\\/|"+e.RE_STARTERS_RE+"|\\b(split|return|print|reverse|grep)\\b)\\s*", + keywords:"split return print reverse grep",relevance:0, + contains:[e.HASH_COMMENT_MODE,{className:"regexp",variants:[{ + begin:c("s|tr|y",n.either(...l,{capture:!0}))},{begin:c("s|tr|y","\\(","\\)")},{ + begin:c("s|tr|y","\\[","\\]")},{begin:c("s|tr|y","\\{","\\}")}],relevance:2},{ + className:"regexp",variants:[{begin:/(m|qr)\/\//,relevance:0},{ + begin:d("(?:m|qr)?",/\//,/\//)},{begin:d("m|qr",n.either(...l,{capture:!0 + }),/\1/)},{begin:d("m|qr",/\(/,/\)/)},{begin:d("m|qr",/\[/,/\]/)},{ + begin:d("m|qr",/\{/,/\}/)}]}]},{className:"function",beginKeywords:"sub", + 
end:"(\\s*\\(.*?\\))?[;{]",excludeEnd:!0,relevance:5,contains:[e.TITLE_MODE]},{ + begin:"-\\w\\b",relevance:0},{begin:"^__DATA__$",end:"^__END__$", + subLanguage:"mojolicious",contains:[{begin:"^@@.*",end:"$",className:"comment"}] + }];return i.contains=g,r.contains=g,{name:"Perl",aliases:["pl","pm"],keywords:a, + contains:g}},grmr_php:e=>{ + const n=e.regex,t=/(?![A-Za-z0-9])(?![$])/,a=n.concat(/[a-zA-Z_\x7f-\xff][a-zA-Z0-9_\x7f-\xff]*/,t),i=n.concat(/(\\?[A-Z][a-z0-9_\x7f-\xff]+|\\?[A-Z]+(?=[A-Z][a-z0-9_\x7f-\xff])){1,}/,t),r={ + scope:"variable",match:"\\$+"+a},s={scope:"subst",variants:[{begin:/\$\w+/},{ + begin:/\{\$/,end:/\}/}]},o=e.inherit(e.APOS_STRING_MODE,{illegal:null + }),l="[ \t\n]",c={scope:"string",variants:[e.inherit(e.QUOTE_STRING_MODE,{ + illegal:null,contains:e.QUOTE_STRING_MODE.contains.concat(s)}),o,{ + begin:/<<<[ \t]*(?:(\w+)|"(\w+)")\n/,end:/[ \t]*(\w+)\b/, + contains:e.QUOTE_STRING_MODE.contains.concat(s),"on:begin":(e,n)=>{ + n.data._beginMatch=e[1]||e[2]},"on:end":(e,n)=>{ + n.data._beginMatch!==e[1]&&n.ignoreMatch()}},e.END_SAME_AS_BEGIN({ + begin:/<<<[ \t]*'(\w+)'\n/,end:/[ \t]*(\w+)\b/})]},d={scope:"number",variants:[{ + begin:"\\b0[bB][01]+(?:_[01]+)*\\b"},{begin:"\\b0[oO][0-7]+(?:_[0-7]+)*\\b"},{ + begin:"\\b0[xX][\\da-fA-F]+(?:_[\\da-fA-F]+)*\\b"},{ + begin:"(?:\\b\\d+(?:_\\d+)*(\\.(?:\\d+(?:_\\d+)*))?|\\B\\.\\d+)(?:[eE][+-]?\\d+)?" + }],relevance:0 + },g=["false","null","true"],u=["__CLASS__","__DIR__","__FILE__","__FUNCTION__","__COMPILER_HALT_OFFSET__","__LINE__","__METHOD__","__NAMESPACE__","__TRAIT__","die","echo","exit","include","include_once","print","require","require_once","array","abstract","and","as","binary","bool","boolean","break","callable","case","catch","class","clone","const","continue","declare","default","do","double","else","elseif","empty","enddeclare","endfor","endforeach","endif","endswitch","endwhile","enum","eval","extends","final","finally","float","for","foreach","from","global","goto","if","implements","instanceof","insteadof","int","integer","interface","isset","iterable","list","match|0","mixed","new","never","object","or","private","protected","public","readonly","real","return","string","switch","throw","trait","try","unset","use","var","void","while","xor","yield"],b=["Error|0","AppendIterator","ArgumentCountError","ArithmeticError","ArrayIterator","ArrayObject","AssertionError","BadFunctionCallException","BadMethodCallException","CachingIterator","CallbackFilterIterator","CompileError","Countable","DirectoryIterator","DivisionByZeroError","DomainException","EmptyIterator","ErrorException","Exception","FilesystemIterator","FilterIterator","GlobIterator","InfiniteIterator","InvalidArgumentException","IteratorIterator","LengthException","LimitIterator","LogicException","MultipleIterator","NoRewindIterator","OutOfBoundsException","OutOfRangeException","OuterIterator","OverflowException","ParentIterator","ParseError","RangeException","RecursiveArrayIterator","RecursiveCachingIterator","RecursiveCallbackFilterIterator","RecursiveDirectoryIterator","RecursiveFilterIterator","RecursiveIterator","RecursiveIteratorIterator","RecursiveRegexIterator","RecursiveTreeIterator","RegexIterator","RuntimeException","SeekableIterator","SplDoublyLinkedList","SplFileInfo","SplFileObject","SplFixedArray","SplHeap","SplMaxHeap","SplMinHeap","SplObjectStorage","SplObserver","SplPriorityQueue","SplQueue","SplStack","SplSubject","SplTempFileObject","TypeError","UnderflowException","UnexpectedValueException","UnhandledMatchError","ArrayAccess","Backe
dEnum","Closure","Fiber","Generator","Iterator","IteratorAggregate","Serializable","Stringable","Throwable","Traversable","UnitEnum","WeakReference","WeakMap","Directory","__PHP_Incomplete_Class","parent","php_user_filter","self","static","stdClass"],m={ + keyword:u,literal:(e=>{const n=[];return e.forEach((e=>{ + n.push(e),e.toLowerCase()===e?n.push(e.toUpperCase()):n.push(e.toLowerCase()) + })),n})(g),built_in:b},p=e=>e.map((e=>e.replace(/\|\d+$/,""))),_={variants:[{ + match:[/new/,n.concat(l,"+"),n.concat("(?!",p(b).join("\\b|"),"\\b)"),i],scope:{ + 1:"keyword",4:"title.class"}}]},h=n.concat(a,"\\b(?!\\()"),f={variants:[{ + match:[n.concat(/::/,n.lookahead(/(?!class\b)/)),h],scope:{2:"variable.constant" + }},{match:[/::/,/class/],scope:{2:"variable.language"}},{ + match:[i,n.concat(/::/,n.lookahead(/(?!class\b)/)),h],scope:{1:"title.class", + 3:"variable.constant"}},{match:[i,n.concat("::",n.lookahead(/(?!class\b)/))], + scope:{1:"title.class"}},{match:[i,/::/,/class/],scope:{1:"title.class", + 3:"variable.language"}}]},E={scope:"attr", + match:n.concat(a,n.lookahead(":"),n.lookahead(/(?!::)/))},y={relevance:0, + begin:/\(/,end:/\)/,keywords:m,contains:[E,r,f,e.C_BLOCK_COMMENT_MODE,c,d,_] + },N={relevance:0, + match:[/\b/,n.concat("(?!fn\\b|function\\b|",p(u).join("\\b|"),"|",p(b).join("\\b|"),"\\b)"),a,n.concat(l,"*"),n.lookahead(/(?=\()/)], + scope:{3:"title.function.invoke"},contains:[y]};y.contains.push(N) + ;const w=[E,f,e.C_BLOCK_COMMENT_MODE,c,d,_];return{case_insensitive:!1, + keywords:m,contains:[{begin:n.concat(/#\[\s*/,i),beginScope:"meta",end:/]/, + endScope:"meta",keywords:{literal:g,keyword:["new","array"]},contains:[{ + begin:/\[/,end:/]/,keywords:{literal:g,keyword:["new","array"]}, + contains:["self",...w]},...w,{scope:"meta",match:i}] + },e.HASH_COMMENT_MODE,e.COMMENT("//","$"),e.COMMENT("/\\*","\\*/",{contains:[{ + scope:"doctag",match:"@[A-Za-z]+"}]}),{match:/__halt_compiler\(\);/, + keywords:"__halt_compiler",starts:{scope:"comment",end:e.MATCH_NOTHING_RE, + contains:[{match:/\?>/,scope:"meta",endsParent:!0}]}},{scope:"meta",variants:[{ + begin:/<\?php/,relevance:10},{begin:/<\?=/},{begin:/<\?/,relevance:.1},{ + begin:/\?>/}]},{scope:"variable.language",match:/\$this\b/},r,N,f,{ + match:[/const/,/\s/,a],scope:{1:"keyword",3:"variable.constant"}},_,{ + scope:"function",relevance:0,beginKeywords:"fn function",end:/[;{]/, + excludeEnd:!0,illegal:"[$%\\[]",contains:[{beginKeywords:"use" + },e.UNDERSCORE_TITLE_MODE,{begin:"=>",endsParent:!0},{scope:"params", + begin:"\\(",end:"\\)",excludeBegin:!0,excludeEnd:!0,keywords:m, + contains:["self",r,f,e.C_BLOCK_COMMENT_MODE,c,d]}]},{scope:"class",variants:[{ + beginKeywords:"enum",illegal:/[($"]/},{beginKeywords:"class interface trait", + illegal:/[:($"]/}],relevance:0,end:/\{/,excludeEnd:!0,contains:[{ + beginKeywords:"extends implements"},e.UNDERSCORE_TITLE_MODE]},{ + beginKeywords:"namespace",relevance:0,end:";",illegal:/[.']/, + contains:[e.inherit(e.UNDERSCORE_TITLE_MODE,{scope:"title.class"})]},{ + beginKeywords:"use",relevance:0,end:";",contains:[{ + match:/\b(as|const|function)\b/,scope:"keyword"},e.UNDERSCORE_TITLE_MODE]},c,d]} + },grmr_php_template:e=>({name:"PHP template",subLanguage:"xml",contains:[{ + begin:/<\?(php|=)?/,end:/\?>/,subLanguage:"php",contains:[{begin:"/\\*", + end:"\\*/",skip:!0},{begin:'b"',end:'"',skip:!0},{begin:"b'",end:"'",skip:!0 + },e.inherit(e.APOS_STRING_MODE,{illegal:null,className:null,contains:null, + skip:!0}),e.inherit(e.QUOTE_STRING_MODE,{illegal:null,className:null, + 
contains:null,skip:!0})]}]}),grmr_plaintext:e=>({name:"Plain text", + aliases:["text","txt"],disableAutodetect:!0}),grmr_python:e=>{ + const n=e.regex,t=/[\p{XID_Start}_]\p{XID_Continue}*/u,a=["and","as","assert","async","await","break","case","class","continue","def","del","elif","else","except","finally","for","from","global","if","import","in","is","lambda","match","nonlocal|10","not","or","pass","raise","return","try","while","with","yield"],i={ + $pattern:/[A-Za-z]\w+|__\w+__/,keyword:a, + built_in:["__import__","abs","all","any","ascii","bin","bool","breakpoint","bytearray","bytes","callable","chr","classmethod","compile","complex","delattr","dict","dir","divmod","enumerate","eval","exec","filter","float","format","frozenset","getattr","globals","hasattr","hash","help","hex","id","input","int","isinstance","issubclass","iter","len","list","locals","map","max","memoryview","min","next","object","oct","open","ord","pow","print","property","range","repr","reversed","round","set","setattr","slice","sorted","staticmethod","str","sum","super","tuple","type","vars","zip"], + literal:["__debug__","Ellipsis","False","None","NotImplemented","True"], + type:["Any","Callable","Coroutine","Dict","List","Literal","Generic","Optional","Sequence","Set","Tuple","Type","Union"] + },r={className:"meta",begin:/^(>>>|\.\.\.) /},s={className:"subst",begin:/\{/, + end:/\}/,keywords:i,illegal:/#/},o={begin:/\{\{/,relevance:0},l={ + className:"string",contains:[e.BACKSLASH_ESCAPE],variants:[{ + begin:/([uU]|[bB]|[rR]|[bB][rR]|[rR][bB])?'''/,end:/'''/, + contains:[e.BACKSLASH_ESCAPE,r],relevance:10},{ + begin:/([uU]|[bB]|[rR]|[bB][rR]|[rR][bB])?"""/,end:/"""/, + contains:[e.BACKSLASH_ESCAPE,r],relevance:10},{ + begin:/([fF][rR]|[rR][fF]|[fF])'''/,end:/'''/, + contains:[e.BACKSLASH_ESCAPE,r,o,s]},{begin:/([fF][rR]|[rR][fF]|[fF])"""/, + end:/"""/,contains:[e.BACKSLASH_ESCAPE,r,o,s]},{begin:/([uU]|[rR])'/,end:/'/, + relevance:10},{begin:/([uU]|[rR])"/,end:/"/,relevance:10},{ + begin:/([bB]|[bB][rR]|[rR][bB])'/,end:/'/},{begin:/([bB]|[bB][rR]|[rR][bB])"/, + end:/"/},{begin:/([fF][rR]|[rR][fF]|[fF])'/,end:/'/, + contains:[e.BACKSLASH_ESCAPE,o,s]},{begin:/([fF][rR]|[rR][fF]|[fF])"/,end:/"/, + contains:[e.BACKSLASH_ESCAPE,o,s]},e.APOS_STRING_MODE,e.QUOTE_STRING_MODE] + },c="[0-9](_?[0-9])*",d=`(\\b(${c}))?\\.(${c})|\\b(${c})\\.`,g="\\b|"+a.join("|"),u={ + className:"number",relevance:0,variants:[{ + begin:`(\\b(${c})|(${d}))[eE][+-]?(${c})[jJ]?(?=${g})`},{begin:`(${d})[jJ]?`},{ + begin:`\\b([1-9](_?[0-9])*|0+(_?0)*)[lLjJ]?(?=${g})`},{ + begin:`\\b0[bB](_?[01])+[lL]?(?=${g})`},{begin:`\\b0[oO](_?[0-7])+[lL]?(?=${g})` + },{begin:`\\b0[xX](_?[0-9a-fA-F])+[lL]?(?=${g})`},{begin:`\\b(${c})[jJ](?=${g})` + }]},b={className:"comment",begin:n.lookahead(/# type:/),end:/$/,keywords:i, + contains:[{begin:/# type:/},{begin:/#/,end:/\b\B/,endsWithParent:!0}]},m={ + className:"params",variants:[{className:"",begin:/\(\s*\)/,skip:!0},{begin:/\(/, + end:/\)/,excludeBegin:!0,excludeEnd:!0,keywords:i, + contains:["self",r,u,l,e.HASH_COMMENT_MODE]}]};return s.contains=[l,u,r],{ + name:"Python",aliases:["py","gyp","ipython"],unicodeRegex:!0,keywords:i, + illegal:/(<\/|\?)|=>/,contains:[r,u,{begin:/\bself\b/},{beginKeywords:"if", + relevance:0},l,b,e.HASH_COMMENT_MODE,{match:[/\bdef/,/\s+/,t],scope:{ + 1:"keyword",3:"title.function"},contains:[m]},{variants:[{ + match:[/\bclass/,/\s+/,t,/\s*/,/\(\s*/,t,/\s*\)/]},{match:[/\bclass/,/\s+/,t]}], + scope:{1:"keyword",3:"title.class",6:"title.class.inherited"}},{ + className:"meta",begin:/^[\t 
]*@/,end:/(?=#)|$/,contains:[u,m,l]}]}}, + grmr_python_repl:e=>({aliases:["pycon"],contains:[{className:"meta.prompt", + starts:{end:/ |$/,starts:{end:"$",subLanguage:"python"}},variants:[{ + begin:/^>>>(?=[ ]|$)/},{begin:/^\.\.\.(?=[ ]|$)/}]}]}),grmr_r:e=>{ + const n=e.regex,t=/(?:(?:[a-zA-Z]|\.[._a-zA-Z])[._a-zA-Z0-9]*)|\.(?!\d)/,a=n.either(/0[xX][0-9a-fA-F]+\.[0-9a-fA-F]*[pP][+-]?\d+i?/,/0[xX][0-9a-fA-F]+(?:[pP][+-]?\d+)?[Li]?/,/(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?[Li]?/),i=/[=!<>:]=|\|\||&&|:::?|<-|<<-|->>|->|\|>|[-+*\/?!$&|:<=>@^~]|\*\*/,r=n.either(/[()]/,/[{}]/,/\[\[/,/[[\]]/,/\\/,/,/) + ;return{name:"R",keywords:{$pattern:t, + keyword:"function if in break next repeat else for while", + literal:"NULL NA TRUE FALSE Inf NaN NA_integer_|10 NA_real_|10 NA_character_|10 NA_complex_|10", + built_in:"LETTERS letters month.abb month.name pi T F abs acos acosh all any anyNA Arg as.call as.character as.complex as.double as.environment as.integer as.logical as.null.default as.numeric as.raw asin asinh atan atanh attr attributes baseenv browser c call ceiling class Conj cos cosh cospi cummax cummin cumprod cumsum digamma dim dimnames emptyenv exp expression floor forceAndCall gamma gc.time globalenv Im interactive invisible is.array is.atomic is.call is.character is.complex is.double is.environment is.expression is.finite is.function is.infinite is.integer is.language is.list is.logical is.matrix is.na is.name is.nan is.null is.numeric is.object is.pairlist is.raw is.recursive is.single is.symbol lazyLoadDBfetch length lgamma list log max min missing Mod names nargs nzchar oldClass on.exit pos.to.env proc.time prod quote range Re rep retracemem return round seq_along seq_len seq.int sign signif sin sinh sinpi sqrt standardGeneric substitute sum switch tan tanh tanpi tracemem trigamma trunc unclass untracemem UseMethod xtfrm" + },contains:[e.COMMENT(/#'/,/$/,{contains:[{scope:"doctag",match:/@examples/, + starts:{end:n.lookahead(n.either(/\n^#'\s*(?=@[a-zA-Z]+)/,/\n^(?!#')/)), + endsParent:!0}},{scope:"doctag",begin:"@param",end:/$/,contains:[{ + scope:"variable",variants:[{match:t},{match:/`(?:\\.|[^`\\])+`/}],endsParent:!0 + }]},{scope:"doctag",match:/@[a-zA-Z]+/},{scope:"keyword",match:/\\[a-zA-Z]+/}] + }),e.HASH_COMMENT_MODE,{scope:"string",contains:[e.BACKSLASH_ESCAPE], + variants:[e.END_SAME_AS_BEGIN({begin:/[rR]"(-*)\(/,end:/\)(-*)"/ + }),e.END_SAME_AS_BEGIN({begin:/[rR]"(-*)\{/,end:/\}(-*)"/ + }),e.END_SAME_AS_BEGIN({begin:/[rR]"(-*)\[/,end:/\](-*)"/ + }),e.END_SAME_AS_BEGIN({begin:/[rR]'(-*)\(/,end:/\)(-*)'/ + }),e.END_SAME_AS_BEGIN({begin:/[rR]'(-*)\{/,end:/\}(-*)'/ + }),e.END_SAME_AS_BEGIN({begin:/[rR]'(-*)\[/,end:/\](-*)'/}),{begin:'"',end:'"', + relevance:0},{begin:"'",end:"'",relevance:0}]},{relevance:0,variants:[{scope:{ + 1:"operator",2:"number"},match:[i,a]},{scope:{1:"operator",2:"number"}, + match:[/%[^%]*%/,a]},{scope:{1:"punctuation",2:"number"},match:[r,a]},{scope:{ + 2:"number"},match:[/[^a-zA-Z0-9._]|^/,a]}]},{scope:{3:"operator"}, + match:[t,/\s+/,/<-/,/\s+/]},{scope:"operator",relevance:0,variants:[{match:i},{ + match:/%[^%]*%/}]},{scope:"punctuation",relevance:0,match:r},{begin:"`",end:"`", + contains:[{begin:/\\./}]}]}},grmr_ruby:e=>{ + const n=e.regex,t="([a-zA-Z_]\\w*[!?=]?|[-+~]@|<<|>>|=~|===?|<=>|[<>]=?|\\*\\*|[-/+%^&*~`|]|\\[\\]=?)",a=n.either(/\b([A-Z]+[a-z0-9]+)+/,/\b([A-Z]+[a-z0-9]+)+[A-Z]+/),i=n.concat(a,/(::\w+)*/),r={ + "variable.constant":["__FILE__","__LINE__","__ENCODING__"], + "variable.language":["self","super"], + 
keyword:["alias","and","begin","BEGIN","break","case","class","defined","do","else","elsif","end","END","ensure","for","if","in","module","next","not","or","redo","require","rescue","retry","return","then","undef","unless","until","when","while","yield","include","extend","prepend","public","private","protected","raise","throw"], + built_in:["proc","lambda","attr_accessor","attr_reader","attr_writer","define_method","private_constant","module_function"], + literal:["true","false","nil"]},s={className:"doctag",begin:"@[A-Za-z]+"},o={ + begin:"#<",end:">"},l=[e.COMMENT("#","$",{contains:[s] + }),e.COMMENT("^=begin","^=end",{contains:[s],relevance:10 + }),e.COMMENT("^__END__",e.MATCH_NOTHING_RE)],c={className:"subst",begin:/#\{/, + end:/\}/,keywords:r},d={className:"string",contains:[e.BACKSLASH_ESCAPE,c], + variants:[{begin:/'/,end:/'/},{begin:/"/,end:/"/},{begin:/`/,end:/`/},{ + begin:/%[qQwWx]?\(/,end:/\)/},{begin:/%[qQwWx]?\[/,end:/\]/},{ + begin:/%[qQwWx]?\{/,end:/\}/},{begin:/%[qQwWx]?/},{begin:/%[qQwWx]?\//, + end:/\//},{begin:/%[qQwWx]?%/,end:/%/},{begin:/%[qQwWx]?-/,end:/-/},{ + begin:/%[qQwWx]?\|/,end:/\|/},{begin:/\B\?(\\\d{1,3})/},{ + begin:/\B\?(\\x[A-Fa-f0-9]{1,2})/},{begin:/\B\?(\\u\{?[A-Fa-f0-9]{1,6}\}?)/},{ + begin:/\B\?(\\M-\\C-|\\M-\\c|\\c\\M-|\\M-|\\C-\\M-)[\x20-\x7e]/},{ + begin:/\B\?\\(c|C-)[\x20-\x7e]/},{begin:/\B\?\\?\S/},{ + begin:n.concat(/<<[-~]?'?/,n.lookahead(/(\w+)(?=\W)[^\n]*\n(?:[^\n]*\n)*?\s*\1\b/)), + contains:[e.END_SAME_AS_BEGIN({begin:/(\w+)/,end:/(\w+)/, + contains:[e.BACKSLASH_ESCAPE,c]})]}]},g="[0-9](_?[0-9])*",u={className:"number", + relevance:0,variants:[{ + begin:`\\b([1-9](_?[0-9])*|0)(\\.(${g}))?([eE][+-]?(${g})|r)?i?\\b`},{ + begin:"\\b0[dD][0-9](_?[0-9])*r?i?\\b"},{begin:"\\b0[bB][0-1](_?[0-1])*r?i?\\b" + },{begin:"\\b0[oO][0-7](_?[0-7])*r?i?\\b"},{ + begin:"\\b0[xX][0-9a-fA-F](_?[0-9a-fA-F])*r?i?\\b"},{ + begin:"\\b0(_?[0-7])+r?i?\\b"}]},b={variants:[{match:/\(\)/},{ + className:"params",begin:/\(/,end:/(?=\))/,excludeBegin:!0,endsParent:!0, + keywords:r}]},m=[d,{variants:[{match:[/class\s+/,i,/\s+<\s+/,i]},{ + match:[/\b(class|module)\s+/,i]}],scope:{2:"title.class", + 4:"title.class.inherited"},keywords:r},{match:[/(include|extend)\s+/,i],scope:{ + 2:"title.class"},keywords:r},{relevance:0,match:[i,/\.new[. 
(]/],scope:{ + 1:"title.class"}},{relevance:0,match:/\b[A-Z][A-Z_0-9]+\b/, + className:"variable.constant"},{relevance:0,match:a,scope:"title.class"},{ + match:[/def/,/\s+/,t],scope:{1:"keyword",3:"title.function"},contains:[b]},{ + begin:e.IDENT_RE+"::"},{className:"symbol", + begin:e.UNDERSCORE_IDENT_RE+"(!|\\?)?:",relevance:0},{className:"symbol", + begin:":(?!\\s)",contains:[d,{begin:t}],relevance:0},u,{className:"variable", + begin:"(\\$\\W)|((\\$|@@?)(\\w+))(?=[^@$?])(?![A-Za-z])(?![@$?'])"},{ + className:"params",begin:/\|/,end:/\|/,excludeBegin:!0,excludeEnd:!0, + relevance:0,keywords:r},{begin:"("+e.RE_STARTERS_RE+"|unless)\\s*", + keywords:"unless",contains:[{className:"regexp",contains:[e.BACKSLASH_ESCAPE,c], + illegal:/\n/,variants:[{begin:"/",end:"/[a-z]*"},{begin:/%r\{/,end:/\}[a-z]*/},{ + begin:"%r\\(",end:"\\)[a-z]*"},{begin:"%r!",end:"![a-z]*"},{begin:"%r\\[", + end:"\\][a-z]*"}]}].concat(o,l),relevance:0}].concat(o,l) + ;c.contains=m,b.contains=m;const p=[{begin:/^\s*=>/,starts:{end:"$",contains:m} + },{className:"meta.prompt", + begin:"^([>?]>|[\\w#]+\\(\\w+\\):\\d+:\\d+[>*]|(\\w+-)?\\d+\\.\\d+\\.\\d+(p\\d+)?[^\\d][^>]+>)(?=[ ])", + starts:{end:"$",keywords:r,contains:m}}];return l.unshift(o),{name:"Ruby", + aliases:["rb","gemspec","podspec","thor","irb"],keywords:r,illegal:/\/\*/, + contains:[e.SHEBANG({binary:"ruby"})].concat(p).concat(l).concat(m)}}, + grmr_rust:e=>{const n=e.regex,t={className:"title.function.invoke",relevance:0, + begin:n.concat(/\b/,/(?!let|for|while|if|else|match\b)/,e.IDENT_RE,n.lookahead(/\s*\(/)) + },a="([ui](8|16|32|64|128|size)|f(32|64))?",i=["drop ","Copy","Send","Sized","Sync","Drop","Fn","FnMut","FnOnce","ToOwned","Clone","Debug","PartialEq","PartialOrd","Eq","Ord","AsRef","AsMut","Into","From","Default","Iterator","Extend","IntoIterator","DoubleEndedIterator","ExactSizeIterator","SliceConcatExt","ToString","assert!","assert_eq!","bitflags!","bytes!","cfg!","col!","concat!","concat_idents!","debug_assert!","debug_assert_eq!","env!","eprintln!","panic!","file!","format!","format_args!","include_bytes!","include_str!","line!","local_data_key!","module_path!","option_env!","print!","println!","select!","stringify!","try!","unimplemented!","unreachable!","vec!","write!","writeln!","macro_rules!","assert_ne!","debug_assert_ne!"],r=["i8","i16","i32","i64","i128","isize","u8","u16","u32","u64","u128","usize","f32","f64","str","char","bool","Box","Option","Result","String","Vec"] + ;return{name:"Rust",aliases:["rs"],keywords:{$pattern:e.IDENT_RE+"!?",type:r, + keyword:["abstract","as","async","await","become","box","break","const","continue","crate","do","dyn","else","enum","extern","false","final","fn","for","if","impl","in","let","loop","macro","match","mod","move","mut","override","priv","pub","ref","return","self","Self","static","struct","super","trait","true","try","type","typeof","unsafe","unsized","use","virtual","where","while","yield"], + literal:["true","false","Some","None","Ok","Err"],built_in:i},illegal:""},t]}}, + grmr_scss:e=>{const n=ie(e),t=le,a=oe,i="@[a-z-]+",r={className:"variable", + begin:"(\\$[a-zA-Z-][a-zA-Z0-9_-]*)\\b",relevance:0};return{name:"SCSS", + case_insensitive:!0,illegal:"[=/|']", + contains:[e.C_LINE_COMMENT_MODE,e.C_BLOCK_COMMENT_MODE,n.CSS_NUMBER_MODE,{ + className:"selector-id",begin:"#[A-Za-z0-9_-]+",relevance:0},{ + className:"selector-class",begin:"\\.[A-Za-z0-9_-]+",relevance:0 + },n.ATTRIBUTE_SELECTOR_MODE,{className:"selector-tag", + 
begin:"\\b("+re.join("|")+")\\b",relevance:0},{className:"selector-pseudo", + begin:":("+a.join("|")+")"},{className:"selector-pseudo", + begin:":(:)?("+t.join("|")+")"},r,{begin:/\(/,end:/\)/, + contains:[n.CSS_NUMBER_MODE]},n.CSS_VARIABLE,{className:"attribute", + begin:"\\b("+ce.join("|")+")\\b"},{ + begin:"\\b(whitespace|wait|w-resize|visible|vertical-text|vertical-ideographic|uppercase|upper-roman|upper-alpha|underline|transparent|top|thin|thick|text|text-top|text-bottom|tb-rl|table-header-group|table-footer-group|sw-resize|super|strict|static|square|solid|small-caps|separate|se-resize|scroll|s-resize|rtl|row-resize|ridge|right|repeat|repeat-y|repeat-x|relative|progress|pointer|overline|outside|outset|oblique|nowrap|not-allowed|normal|none|nw-resize|no-repeat|no-drop|newspaper|ne-resize|n-resize|move|middle|medium|ltr|lr-tb|lowercase|lower-roman|lower-alpha|loose|list-item|line|line-through|line-edge|lighter|left|keep-all|justify|italic|inter-word|inter-ideograph|inside|inset|inline|inline-block|inherit|inactive|ideograph-space|ideograph-parenthesis|ideograph-numeric|ideograph-alpha|horizontal|hidden|help|hand|groove|fixed|ellipsis|e-resize|double|dotted|distribute|distribute-space|distribute-letter|distribute-all-lines|disc|disabled|default|decimal|dashed|crosshair|collapse|col-resize|circle|char|center|capitalize|break-word|break-all|bottom|both|bolder|bold|block|bidi-override|below|baseline|auto|always|all-scroll|absolute|table|table-cell)\\b" + },{begin:/:/,end:/[;}{]/,relevance:0, + contains:[n.BLOCK_COMMENT,r,n.HEXCOLOR,n.CSS_NUMBER_MODE,e.QUOTE_STRING_MODE,e.APOS_STRING_MODE,n.IMPORTANT,n.FUNCTION_DISPATCH] + },{begin:"@(page|font-face)",keywords:{$pattern:i,keyword:"@page @font-face"}},{ + begin:"@",end:"[{;]",returnBegin:!0,keywords:{$pattern:/[a-z-]+/, + keyword:"and or not only",attribute:se.join(" ")},contains:[{begin:i, + className:"keyword"},{begin:/[a-z-]+(?=:)/,className:"attribute" + },r,e.QUOTE_STRING_MODE,e.APOS_STRING_MODE,n.HEXCOLOR,n.CSS_NUMBER_MODE] + },n.FUNCTION_DISPATCH]}},grmr_shell:e=>({name:"Shell Session", + aliases:["console","shellsession"],contains:[{className:"meta.prompt", + begin:/^\s{0,3}[/~\w\d[\]()@-]*[>%$#][ ]?/,starts:{end:/[^\\](?=\s*$)/, + subLanguage:"bash"}}]}),grmr_sql:e=>{ + const n=e.regex,t=e.COMMENT("--","$"),a=["true","false","unknown"],i=["bigint","binary","blob","boolean","char","character","clob","date","dec","decfloat","decimal","float","int","integer","interval","nchar","nclob","national","numeric","real","row","smallint","time","timestamp","varchar","varying","varbinary"],r=["abs","acos","array_agg","asin","atan","avg","cast","ceil","ceiling","coalesce","corr","cos","cosh","count","covar_pop","covar_samp","cume_dist","dense_rank","deref","element","exp","extract","first_value","floor","json_array","json_arrayagg","json_exists","json_object","json_objectagg","json_query","json_table","json_table_primitive","json_value","lag","last_value","lead","listagg","ln","log","log10","lower","max","min","mod","nth_value","ntile","nullif","percent_rank","percentile_cont","percentile_disc","position","position_regex","power","rank","regr_avgx","regr_avgy","regr_count","regr_intercept","regr_r2","regr_slope","regr_sxx","regr_sxy","regr_syy","row_number","sin","sinh","sqrt","stddev_pop","stddev_samp","substring","substring_regex","sum","tan","tanh","translate","translate_regex","treat","trim","trim_array","unnest","upper","value_of","var_pop","var_samp","width_bucket"],s=["create table","insert into","primary key","foreign key","not 
null","alter table","add constraint","grouping sets","on overflow","character set","respect nulls","ignore nulls","nulls first","nulls last","depth first","breadth first"],o=r,l=["abs","acos","all","allocate","alter","and","any","are","array","array_agg","array_max_cardinality","as","asensitive","asin","asymmetric","at","atan","atomic","authorization","avg","begin","begin_frame","begin_partition","between","bigint","binary","blob","boolean","both","by","call","called","cardinality","cascaded","case","cast","ceil","ceiling","char","char_length","character","character_length","check","classifier","clob","close","coalesce","collate","collect","column","commit","condition","connect","constraint","contains","convert","copy","corr","corresponding","cos","cosh","count","covar_pop","covar_samp","create","cross","cube","cume_dist","current","current_catalog","current_date","current_default_transform_group","current_path","current_role","current_row","current_schema","current_time","current_timestamp","current_path","current_role","current_transform_group_for_type","current_user","cursor","cycle","date","day","deallocate","dec","decimal","decfloat","declare","default","define","delete","dense_rank","deref","describe","deterministic","disconnect","distinct","double","drop","dynamic","each","element","else","empty","end","end_frame","end_partition","end-exec","equals","escape","every","except","exec","execute","exists","exp","external","extract","false","fetch","filter","first_value","float","floor","for","foreign","frame_row","free","from","full","function","fusion","get","global","grant","group","grouping","groups","having","hold","hour","identity","in","indicator","initial","inner","inout","insensitive","insert","int","integer","intersect","intersection","interval","into","is","join","json_array","json_arrayagg","json_exists","json_object","json_objectagg","json_query","json_table","json_table_primitive","json_value","lag","language","large","last_value","lateral","lead","leading","left","like","like_regex","listagg","ln","local","localtime","localtimestamp","log","log10","lower","match","match_number","match_recognize","matches","max","member","merge","method","min","minute","mod","modifies","module","month","multiset","national","natural","nchar","nclob","new","no","none","normalize","not","nth_value","ntile","null","nullif","numeric","octet_length","occurrences_regex","of","offset","old","omit","on","one","only","open","or","order","out","outer","over","overlaps","overlay","parameter","partition","pattern","per","percent","percent_rank","percentile_cont","percentile_disc","period","portion","position","position_regex","power","precedes","precision","prepare","primary","procedure","ptf","range","rank","reads","real","recursive","ref","references","referencing","regr_avgx","regr_avgy","regr_count","regr_intercept","regr_r2","regr_slope","regr_sxx","regr_sxy","regr_syy","release","result","return","returns","revoke","right","rollback","rollup","row","row_number","rows","running","savepoint","scope","scroll","search","second","seek","select","sensitive","session_user","set","show","similar","sin","sinh","skip","smallint","some","specific","specifictype","sql","sqlexception","sqlstate","sqlwarning","sqrt","start","static","stddev_pop","stddev_samp","submultiset","subset","substring","substring_regex","succeeds","sum","symmetric","system","system_time","system_user","table","tablesample","tan","tanh","then","time","timestamp","timezone_hour","timezone_minute","to","trailing","translate","translate_rege
x","translation","treat","trigger","trim","trim_array","true","truncate","uescape","union","unique","unknown","unnest","update","upper","user","using","value","values","value_of","var_pop","var_samp","varbinary","varchar","varying","versioning","when","whenever","where","width_bucket","window","with","within","without","year","add","asc","collation","desc","final","first","last","view"].filter((e=>!r.includes(e))),c={ + begin:n.concat(/\b/,n.either(...o),/\s*\(/),relevance:0,keywords:{built_in:o}} + ;return{name:"SQL",case_insensitive:!0,illegal:/[{}]|<\//,keywords:{ + $pattern:/\b[\w\.]+/,keyword:((e,{exceptions:n,when:t}={})=>{const a=t + ;return n=n||[],e.map((e=>e.match(/\|\d+$/)||n.includes(e)?e:a(e)?e+"|0":e)) + })(l,{when:e=>e.length<3}),literal:a,type:i, + built_in:["current_catalog","current_date","current_default_transform_group","current_path","current_role","current_schema","current_transform_group_for_type","current_user","session_user","system_time","system_user","current_time","localtime","current_timestamp","localtimestamp"] + },contains:[{begin:n.either(...s),relevance:0,keywords:{$pattern:/[\w\.]+/, + keyword:l.concat(s),literal:a,type:i}},{className:"type", + begin:n.either("double precision","large object","with timezone","without timezone") + },c,{className:"variable",begin:/@[a-z0-9][a-z0-9_]*/},{className:"string", + variants:[{begin:/'/,end:/'/,contains:[{begin:/''/}]}]},{begin:/"/,end:/"/, + contains:[{begin:/""/}]},e.C_NUMBER_MODE,e.C_BLOCK_COMMENT_MODE,t,{ + className:"operator",begin:/[-+*/=%^~]|&&?|\|\|?|!=?|<(?:=>?|<|>)?|>[>=]?/, + relevance:0}]}},grmr_swift:e=>{const n={match:/\s+/,relevance:0 + },t=e.COMMENT("/\\*","\\*/",{contains:["self"]}),a=[e.C_LINE_COMMENT_MODE,t],i={ + match:[/\./,m(...xe,...Me)],className:{2:"keyword"}},r={match:b(/\./,m(...Ae)), + relevance:0},s=Ae.filter((e=>"string"==typeof e)).concat(["_|0"]),o={variants:[{ + className:"keyword", + match:m(...Ae.filter((e=>"string"!=typeof e)).concat(Se).map(ke),...Me)}]},l={ + $pattern:m(/\b\w+/,/#\w+/),keyword:s.concat(Re),literal:Ce},c=[i,r,o],g=[{ + match:b(/\./,m(...De)),relevance:0},{className:"built_in", + match:b(/\b/,m(...De),/(?=\()/)}],u={match:/->/,relevance:0},p=[u,{ + className:"operator",relevance:0,variants:[{match:Be},{match:`\\.(\\.|${Le})+`}] + }],_="([0-9]_*)+",h="([0-9a-fA-F]_*)+",f={className:"number",relevance:0, + variants:[{match:`\\b(${_})(\\.(${_}))?([eE][+-]?(${_}))?\\b`},{ + match:`\\b0x(${h})(\\.(${h}))?([pP][+-]?(${_}))?\\b`},{match:/\b0o([0-7]_*)+\b/ + },{match:/\b0b([01]_*)+\b/}]},E=(e="")=>({className:"subst",variants:[{ + match:b(/\\/,e,/[0\\tnr"']/)},{match:b(/\\/,e,/u\{[0-9a-fA-F]{1,8}\}/)}] + }),y=(e="")=>({className:"subst",match:b(/\\/,e,/[\t ]*(?:[\r\n]|\r\n)/) + }),N=(e="")=>({className:"subst",label:"interpol",begin:b(/\\/,e,/\(/),end:/\)/ + }),w=(e="")=>({begin:b(e,/"""/),end:b(/"""/,e),contains:[E(e),y(e),N(e)] + }),v=(e="")=>({begin:b(e,/"/),end:b(/"/,e),contains:[E(e),N(e)]}),O={ + className:"string", + variants:[w(),w("#"),w("##"),w("###"),v(),v("#"),v("##"),v("###")] + },k=[e.BACKSLASH_ESCAPE,{begin:/\[/,end:/\]/,relevance:0, + contains:[e.BACKSLASH_ESCAPE]}],x={begin:/\/[^\s](?=[^/\n]*\/)/,end:/\//, + contains:k},M=e=>{const n=b(e,/\//),t=b(/\//,e);return{begin:n,end:t, + contains:[...k,{scope:"comment",begin:`#(?!.*${t})`,end:/$/}]}},S={ + scope:"regexp",variants:[M("###"),M("##"),M("#"),x]},A={match:b(/`/,Fe,/`/) + },C=[A,{className:"variable",match:/\$\d+/},{className:"variable", + 
match:`\\$${ze}+`}],T=[{match:/(@|#(un)?)available/,scope:"keyword",starts:{ + contains:[{begin:/\(/,end:/\)/,keywords:Pe,contains:[...p,f,O]}]}},{ + scope:"keyword",match:b(/@/,m(...je))},{scope:"meta",match:b(/@/,Fe)}],R={ + match:d(/\b[A-Z]/),relevance:0,contains:[{className:"type", + match:b(/(AV|CA|CF|CG|CI|CL|CM|CN|CT|MK|MP|MTK|MTL|NS|SCN|SK|UI|WK|XC)/,ze,"+") + },{className:"type",match:Ue,relevance:0},{match:/[?!]+/,relevance:0},{ + match:/\.\.\./,relevance:0},{match:b(/\s+&\s+/,d(Ue)),relevance:0}]},D={ + begin://,keywords:l,contains:[...a,...c,...T,u,R]};R.contains.push(D) + ;const I={begin:/\(/,end:/\)/,relevance:0,keywords:l,contains:["self",{ + match:b(Fe,/\s*:/),keywords:"_|0",relevance:0 + },...a,S,...c,...g,...p,f,O,...C,...T,R]},L={begin://, + keywords:"repeat each",contains:[...a,R]},B={begin:/\(/,end:/\)/,keywords:l, + contains:[{begin:m(d(b(Fe,/\s*:/)),d(b(Fe,/\s+/,Fe,/\s*:/))),end:/:/, + relevance:0,contains:[{className:"keyword",match:/\b_\b/},{className:"params", + match:Fe}]},...a,...c,...p,f,O,...T,R,I],endsParent:!0,illegal:/["']/},$={ + match:[/(func|macro)/,/\s+/,m(A.match,Fe,Be)],className:{1:"keyword", + 3:"title.function"},contains:[L,B,n],illegal:[/\[/,/%/]},z={ + match:[/\b(?:subscript|init[?!]?)/,/\s*(?=[<(])/],className:{1:"keyword"}, + contains:[L,B,n],illegal:/\[|%/},F={match:[/operator/,/\s+/,Be],className:{ + 1:"keyword",3:"title"}},U={begin:[/precedencegroup/,/\s+/,Ue],className:{ + 1:"keyword",3:"title"},contains:[R],keywords:[...Te,...Ce],end:/}/} + ;for(const e of O.variants){const n=e.contains.find((e=>"interpol"===e.label)) + ;n.keywords=l;const t=[...c,...g,...p,f,O,...C];n.contains=[...t,{begin:/\(/, + end:/\)/,contains:["self",...t]}]}return{name:"Swift",keywords:l, + contains:[...a,$,z,{beginKeywords:"struct protocol class extension enum actor", + end:"\\{",excludeEnd:!0,keywords:l,contains:[e.inherit(e.TITLE_MODE,{ + className:"title.class",begin:/[A-Za-z$_][\u00C0-\u02B80-9A-Za-z$_]*/}),...c] + },F,U,{beginKeywords:"import",end:/$/,contains:[...a],relevance:0 + },S,...c,...g,...p,f,O,...C,...T,R,I]}},grmr_typescript:e=>{ + const n=Oe(e),t=_e,a=["any","void","number","boolean","string","object","never","symbol","bigint","unknown"],i={ + beginKeywords:"namespace",end:/\{/,excludeEnd:!0, + contains:[n.exports.CLASS_REFERENCE]},r={beginKeywords:"interface",end:/\{/, + excludeEnd:!0,keywords:{keyword:"interface extends",built_in:a}, + contains:[n.exports.CLASS_REFERENCE]},s={$pattern:_e, + keyword:he.concat(["type","namespace","interface","public","private","protected","implements","declare","abstract","readonly","enum","override"]), + literal:fe,built_in:ve.concat(a),"variable.language":we},o={className:"meta", + begin:"@"+t},l=(e,n,t)=>{const a=e.contains.findIndex((e=>e.label===n)) + ;if(-1===a)throw Error("can not find mode to replace");e.contains.splice(a,1,t)} + ;return Object.assign(n.keywords,s), + n.exports.PARAMS_CONTAINS.push(o),n.contains=n.contains.concat([o,i,r]), + l(n,"shebang",e.SHEBANG()),l(n,"use_strict",{className:"meta",relevance:10, + begin:/^\s*['"]use strict['"]/ + }),n.contains.find((e=>"func.def"===e.label)).relevance=0,Object.assign(n,{ + name:"TypeScript",aliases:["ts","tsx","mts","cts"]}),n},grmr_vbnet:e=>{ + const n=e.regex,t=/\d{1,2}\/\d{1,2}\/\d{4}/,a=/\d{4}-\d{1,2}-\d{1,2}/,i=/(\d|1[012])(:\d+){0,2} *(AM|PM)/,r=/\d{1,2}(:\d{1,2}){1,2}/,s={ + className:"literal",variants:[{begin:n.concat(/# */,n.either(a,t),/ *#/)},{ + begin:n.concat(/# */,r,/ *#/)},{begin:n.concat(/# */,i,/ *#/)},{ + begin:n.concat(/# 
*/,n.either(a,t),/ +/,n.either(i,r),/ *#/)}] + },o=e.COMMENT(/'''/,/$/,{contains:[{className:"doctag",begin:/<\/?/,end:/>/}] + }),l=e.COMMENT(null,/$/,{variants:[{begin:/'/},{begin:/([\t ]|^)REM(?=\s)/}]}) + ;return{name:"Visual Basic .NET",aliases:["vb"],case_insensitive:!0, + classNameAliases:{label:"symbol"},keywords:{ + keyword:"addhandler alias aggregate ansi as async assembly auto binary by byref byval call case catch class compare const continue custom declare default delegate dim distinct do each equals else elseif end enum erase error event exit explicit finally for friend from function get global goto group handles if implements imports in inherits interface into iterator join key let lib loop me mid module mustinherit mustoverride mybase myclass namespace narrowing new next notinheritable notoverridable of off on operator option optional order overloads overridable overrides paramarray partial preserve private property protected public raiseevent readonly redim removehandler resume return select set shadows shared skip static step stop structure strict sub synclock take text then throw to try unicode until using when where while widening with withevents writeonly yield", + built_in:"addressof and andalso await directcast gettype getxmlnamespace is isfalse isnot istrue like mod nameof new not or orelse trycast typeof xor cbool cbyte cchar cdate cdbl cdec cint clng cobj csbyte cshort csng cstr cuint culng cushort", + type:"boolean byte char date decimal double integer long object sbyte short single string uinteger ulong ushort", + literal:"true false nothing"}, + illegal:"//|\\{|\\}|endif|gosub|variant|wend|^\\$ ",contains:[{ + className:"string",begin:/"(""|[^/n])"C\b/},{className:"string",begin:/"/, + end:/"/,illegal:/\n/,contains:[{begin:/""/}]},s,{className:"number",relevance:0, + variants:[{begin:/\b\d[\d_]*((\.[\d_]+(E[+-]?[\d_]+)?)|(E[+-]?[\d_]+))[RFD@!#]?/ + },{begin:/\b\d[\d_]*((U?[SIL])|[%&])?/},{begin:/&H[\dA-F_]+((U?[SIL])|[%&])?/},{ + begin:/&O[0-7_]+((U?[SIL])|[%&])?/},{begin:/&B[01_]+((U?[SIL])|[%&])?/}]},{ + className:"label",begin:/^\w+:/},o,l,{className:"meta", + begin:/[\t ]*#(const|disable|else|elseif|enable|end|externalsource|if|region)\b/, + end:/$/,keywords:{ + keyword:"const disable else elseif enable end externalsource if region then"}, + contains:[l]}]}},grmr_wasm:e=>{e.regex;const n=e.COMMENT(/\(;/,/;\)/) + ;return n.contains.push("self"),{name:"WebAssembly",keywords:{$pattern:/[\w.]+/, + keyword:["anyfunc","block","br","br_if","br_table","call","call_indirect","data","drop","elem","else","end","export","func","global.get","global.set","local.get","local.set","local.tee","get_global","get_local","global","if","import","local","loop","memory","memory.grow","memory.size","module","mut","nop","offset","param","result","return","select","set_global","set_local","start","table","tee_local","then","type","unreachable"] + },contains:[e.COMMENT(/;;/,/$/),n,{match:[/(?:offset|align)/,/\s*/,/=/], + className:{1:"keyword",3:"operator"}},{className:"variable",begin:/\$[\w_]+/},{ + match:/(\((?!;)|\))+/,className:"punctuation",relevance:0},{ + begin:[/(?:func|call|call_indirect)/,/\s+/,/\$[^\s)]+/],className:{1:"keyword", + 3:"title.function"}},e.QUOTE_STRING_MODE,{match:/(i32|i64|f32|f64)(?!\.)/, + className:"type"},{className:"keyword", + 
match:/\b(f32|f64|i32|i64)(?:\.(?:abs|add|and|ceil|clz|const|convert_[su]\/i(?:32|64)|copysign|ctz|demote\/f64|div(?:_[su])?|eqz?|extend_[su]\/i32|floor|ge(?:_[su])?|gt(?:_[su])?|le(?:_[su])?|load(?:(?:8|16|32)_[su])?|lt(?:_[su])?|max|min|mul|nearest|neg?|or|popcnt|promote\/f32|reinterpret\/[fi](?:32|64)|rem_[su]|rot[lr]|shl|shr_[su]|store(?:8|16|32)?|sqrt|sub|trunc(?:_[su]\/f(?:32|64))?|wrap\/i64|xor))\b/ + },{className:"number",relevance:0, + match:/[+-]?\b(?:\d(?:_?\d)*(?:\.\d(?:_?\d)*)?(?:[eE][+-]?\d(?:_?\d)*)?|0x[\da-fA-F](?:_?[\da-fA-F])*(?:\.[\da-fA-F](?:_?[\da-fA-D])*)?(?:[pP][+-]?\d(?:_?\d)*)?)\b|\binf\b|\bnan(?::0x[\da-fA-F](?:_?[\da-fA-D])*)?\b/ + }]}},grmr_xml:e=>{ + const n=e.regex,t=n.concat(/[\p{L}_]/u,n.optional(/[\p{L}0-9_.-]*:/u),/[\p{L}0-9_.-]*/u),a={ + className:"symbol",begin:/&[a-z]+;|&#[0-9]+;|&#x[a-f0-9]+;/},i={begin:/\s/, + contains:[{className:"keyword",begin:/#?[a-z_][a-z1-9_-]+/,illegal:/\n/}] + },r=e.inherit(i,{begin:/\(/,end:/\)/}),s=e.inherit(e.APOS_STRING_MODE,{ + className:"string"}),o=e.inherit(e.QUOTE_STRING_MODE,{className:"string"}),l={ + endsWithParent:!0,illegal:/`]+/}]}]}]};return{ + name:"HTML, XML", + aliases:["html","xhtml","rss","atom","xjb","xsd","xsl","plist","wsf","svg"], + case_insensitive:!0,unicodeRegex:!0,contains:[{className:"meta",begin://,relevance:10,contains:[i,o,s,r,{begin:/\[/,end:/\]/,contains:[{ + className:"meta",begin://,contains:[i,r,o,s]}]}] + },e.COMMENT(//,{relevance:10}),{begin://, + relevance:10},a,{className:"meta",end:/\?>/,variants:[{begin:/<\?xml/, + relevance:10,contains:[o]},{begin:/<\?[a-z][a-z0-9]+/}]},{className:"tag", + begin:/)/,end:/>/,keywords:{name:"style"},contains:[l],starts:{ + end:/<\/style>/,returnEnd:!0,subLanguage:["css","xml"]}},{className:"tag", + begin:/)/,end:/>/,keywords:{name:"script"},contains:[l],starts:{ + end:/<\/script>/,returnEnd:!0,subLanguage:["javascript","handlebars","xml"]}},{ + className:"tag",begin:/<>|<\/>/},{className:"tag", + begin:n.concat(//,/>/,/\s/)))), + end:/\/?>/,contains:[{className:"name",begin:t,relevance:0,starts:l}]},{ + className:"tag",begin:n.concat(/<\//,n.lookahead(n.concat(t,/>/))),contains:[{ + className:"name",begin:t,relevance:0},{begin:/>/,relevance:0,endsParent:!0}]}]} + },grmr_yaml:e=>{ + const n="true false yes no null",t="[\\w#;/?:@&=+$,.~*'()[\\]]+",a={ + className:"string",relevance:0,variants:[{begin:/'/,end:/'/},{begin:/"/,end:/"/ + },{begin:/\S+/}],contains:[e.BACKSLASH_ESCAPE,{className:"template-variable", + variants:[{begin:/\{\{/,end:/\}\}/},{begin:/%\{/,end:/\}/}]}]},i=e.inherit(a,{ + variants:[{begin:/'/,end:/'/},{begin:/"/,end:/"/},{begin:/[^\s,{}[\]]+/}]}),r={ + end:",",endsWithParent:!0,excludeEnd:!0,keywords:n,relevance:0},s={begin:/\{/, + end:/\}/,contains:[r],illegal:"\\n",relevance:0},o={begin:"\\[",end:"\\]", + contains:[r],illegal:"\\n",relevance:0},l=[{className:"attr",variants:[{ + begin:"\\w[\\w :\\/.-]*:(?=[ \t]|$)"},{begin:'"\\w[\\w :\\/.-]*":(?=[ \t]|$)'},{ + begin:"'\\w[\\w :\\/.-]*':(?=[ \t]|$)"}]},{className:"meta",begin:"^---\\s*$", + relevance:10},{className:"string", + begin:"[\\|>]([1-9]?[+-])?[ ]*\\n( +)[^ ][^\\n]*\\n(\\2[^\\n]+\\n?)*"},{ + begin:"<%[%=-]?",end:"[%-]?%>",subLanguage:"ruby",excludeBegin:!0,excludeEnd:!0, + relevance:0},{className:"type",begin:"!\\w+!"+t},{className:"type", + begin:"!<"+t+">"},{className:"type",begin:"!"+t},{className:"type",begin:"!!"+t + },{className:"meta",begin:"&"+e.UNDERSCORE_IDENT_RE+"$"},{className:"meta", + begin:"\\*"+e.UNDERSCORE_IDENT_RE+"$"},{className:"bullet",begin:"-(?=[ 
]|$)", + relevance:0},e.HASH_COMMENT_MODE,{beginKeywords:n,keywords:{literal:n}},{ + className:"number", + begin:"\\b[0-9]{4}(-[0-9][0-9]){0,2}([Tt \\t][0-9][0-9]?(:[0-9][0-9]){2})?(\\.[0-9]*)?([ \\t])*(Z|[-+][0-9][0-9]?(:[0-9][0-9])?)?\\b" + },{className:"number",begin:e.C_NUMBER_RE+"\\b",relevance:0},s,o,a],c=[...l] + ;return c.pop(),c.push(i),r.contains=c,{name:"YAML",case_insensitive:!0, + aliases:["yml"],contains:l}}});const He=ae;for(const e of Object.keys(Ke)){ + const n=e.replace("grmr_","").replace("_","-");He.registerLanguage(n,Ke[e])} + return He}() + ;"object"==typeof exports&&"undefined"!=typeof module&&(module.exports=hljs); \ No newline at end of file diff --git a/docs/md_v2/assets/highlight_init.js b/docs/md_v2/assets/highlight_init.js new file mode 100644 index 0000000000000000000000000000000000000000..e237927864de512708ffc76fe5f7b94d28959328 --- /dev/null +++ b/docs/md_v2/assets/highlight_init.js @@ -0,0 +1,6 @@ +document.addEventListener('DOMContentLoaded', (event) => { + document.querySelectorAll('pre code').forEach((block) => { + hljs.highlightBlock(block); + }); + }); + \ No newline at end of file diff --git a/docs/md_v2/assets/styles.css b/docs/md_v2/assets/styles.css new file mode 100644 index 0000000000000000000000000000000000000000..68a93f5d18788db5d1e015e6aebb1fdebca82cc4 --- /dev/null +++ b/docs/md_v2/assets/styles.css @@ -0,0 +1,160 @@ +@font-face { + font-family: "Monaco"; + font-style: normal; + font-weight: normal; + src: local("Monaco"), url("Monaco.woff") format("woff"); +} + +:root { + --global-font-size: 16px; + --global-line-height: 1.5em; + --global-space: 10px; + --font-stack: Menlo, Monaco, Lucida Console, Liberation Mono, DejaVu Sans Mono, Bitstream Vera Sans Mono, + Courier New, monospace, serif; + --font-stack: dm, Monaco, Courier New, monospace, serif; + --mono-font-stack: Menlo, Monaco, Lucida Console, Liberation Mono, DejaVu Sans Mono, Bitstream Vera Sans Mono, + Courier New, monospace, serif; + + --background-color: #151515; /* Dark background */ + --font-color: #eaeaea; /* Light font color for contrast */ + --invert-font-color: #151515; /* Dark color for inverted elements */ + --primary-color: #1a95e0; /* Primary color can remain the same or be adjusted for better contrast */ + --secondary-color: #727578; /* Secondary color for less important text */ + --error-color: #ff5555; /* Bright color for errors */ + --progress-bar-background: #444; /* Darker background for progress bar */ + --progress-bar-fill: #1a95e0; /* Bright color for progress bar fill */ + --code-bg-color: #1e1e1e; /* Darker background for code blocks */ + --input-style: solid; /* Keeping input style solid */ + --block-background-color: #202020; /* Darker background for block elements */ + --global-font-color: #eaeaea; /* Light font color for global elements */ + + --background-color: #222225; + + --background-color: #070708; + --page-width: 70em; + --font-color: #e8e9ed; + --invert-font-color: #222225; + --secondary-color: #a3abba; + --secondary-color: #d5cec0; + --tertiary-color: #a3abba; + --primary-color: #09b5a5; /* Updated to the brand color */ + --primary-color: #50ffff; /* Updated to the brand color */ + --error-color: #ff3c74; + --progress-bar-background: #3f3f44; + --progress-bar-fill: #09b5a5; /* Updated to the brand color */ + --code-bg-color: #3f3f44; + --input-style: solid; + --display-h1-decoration: none; + + --display-h1-decoration: none; +} + +/* body { + background-color: var(--background-color); + color: var(--font-color); +} + +a { + color: 
var(--primary-color); +} + +a:hover { + background-color: var(--primary-color); + color: var(--invert-font-color); +} + +blockquote::after { + color: #444; +} + +pre, code { + background-color: var(--code-bg-color); + color: var(--font-color); +} + +.terminal-nav:first-child { + border-bottom: 1px dashed var(--secondary-color); +} */ + +.terminal-mkdocs-main-content { + line-height: var(--global-line-height); +} + +strong, +.highlight { + /* background: url(//s2.svgbox.net/pen-brushes.svg?ic=brush-1&color=50ffff); */ + background-color: #50ffff33; +} + +.terminal-card > header { + color: var(--font-color); + text-align: center; + background-color: var(--progress-bar-background); + padding: 0.3em 0.5em; +} +.btn.btn-sm { + color: var(--font-color); + padding: 0.2em 0.5em; + font-size: 0.8em; +} + +.loading-message { + display: none; + margin-top: 20px; +} + +.response-section { + display: none; + padding-top: 20px; +} + +.tabs { + display: flex; + flex-direction: column; +} +.tab-list { + display: flex; + padding: 0; + margin: 0; + list-style-type: none; + border-bottom: 1px solid var(--font-color); +} +.tab-item { + cursor: pointer; + padding: 10px; + border: 1px solid var(--font-color); + margin-right: -1px; + border-bottom: none; +} +.tab-item:hover, +.tab-item:focus, +.tab-item:active { + background-color: var(--progress-bar-background); +} +.tab-content { + display: none; + border: 1px solid var(--font-color); + border-top: none; +} +.tab-content:first-of-type { + display: block; +} + +.tab-content header { + padding: 0.5em; + display: flex; + justify-content: end; + align-items: center; + background-color: var(--progress-bar-background); +} +.tab-content pre { + margin: 0; + max-height: 300px; overflow: auto; border:none; +} + +ol li::before { + content: counters(item, ".") ". "; + counter-increment: item; + /* float: left; */ + /* padding-right: 5px; */ +} \ No newline at end of file diff --git a/docs/md_v2/basic/browser-config.md b/docs/md_v2/basic/browser-config.md new file mode 100644 index 0000000000000000000000000000000000000000..7df4a97bbed715f21221f3973adf3310d267296b --- /dev/null +++ b/docs/md_v2/basic/browser-config.md @@ -0,0 +1,208 @@ +# Browser Configuration + +Crawl4AI supports multiple browser engines and offers extensive configuration options for browser behavior. 
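+
+The examples below show only the relevant `AsyncWebCrawler` arguments. A minimal sketch of the `asyncio` boilerplate they assume (the same pattern used in the migration examples elsewhere in these docs; the URL is just a placeholder):
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+
+async def main():
+    # Any of the configuration options shown below can be passed here
+    async with AsyncWebCrawler(browser_type="chromium", headless=True, verbose=True) as crawler:
+        result = await crawler.arun(url="https://example.com")
+        if result.success:
+            print(result.markdown[:300])  # preview the extracted markdown
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```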
+ +## Browser Types + +Choose from three browser engines: + +```python +# Chromium (default) +async with AsyncWebCrawler(browser_type="chromium") as crawler: + result = await crawler.arun(url="https://example.com") + +# Firefox +async with AsyncWebCrawler(browser_type="firefox") as crawler: + result = await crawler.arun(url="https://example.com") + +# WebKit +async with AsyncWebCrawler(browser_type="webkit") as crawler: + result = await crawler.arun(url="https://example.com") +``` + +## Basic Configuration + +Common browser settings: + +```python +async with AsyncWebCrawler( + headless=True, # Run in headless mode (no GUI) + verbose=True, # Enable detailed logging + sleep_on_close=False # No delay when closing browser +) as crawler: + result = await crawler.arun(url="https://example.com") +``` + +## Identity Management + +Control how your crawler appears to websites: + +```python +# Custom user agent +async with AsyncWebCrawler( + user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" +) as crawler: + result = await crawler.arun(url="https://example.com") + +# Custom headers +headers = { + "Accept-Language": "en-US,en;q=0.9", + "Cache-Control": "no-cache" +} +async with AsyncWebCrawler(headers=headers) as crawler: + result = await crawler.arun(url="https://example.com") +``` + +## Screenshot Capabilities + +Capture page screenshots with enhanced error handling: + +```python +result = await crawler.arun( + url="https://example.com", + screenshot=True, # Enable screenshot + screenshot_wait_for=2.0 # Wait 2 seconds before capture +) + +if result.screenshot: # Base64 encoded image + import base64 + with open("screenshot.png", "wb") as f: + f.write(base64.b64decode(result.screenshot)) +``` + +## Timeouts and Waiting + +Control page loading behavior: + +```python +result = await crawler.arun( + url="https://example.com", + page_timeout=60000, # Page load timeout (ms) + delay_before_return_html=2.0, # Wait before content capture + wait_for="css:.dynamic-content" # Wait for specific element +) +``` + +## JavaScript Execution + +Execute custom JavaScript before crawling: + +```python +# Single JavaScript command +result = await crawler.arun( + url="https://example.com", + js_code="window.scrollTo(0, document.body.scrollHeight);" +) + +# Multiple commands +js_commands = [ + "window.scrollTo(0, document.body.scrollHeight);", + "document.querySelector('.load-more').click();" +] +result = await crawler.arun( + url="https://example.com", + js_code=js_commands +) +``` + +## Proxy Configuration + +Use proxies for enhanced access: + +```python +# Simple proxy +async with AsyncWebCrawler( + proxy="http://proxy.example.com:8080" +) as crawler: + result = await crawler.arun(url="https://example.com") + +# Proxy with authentication +proxy_config = { + "server": "http://proxy.example.com:8080", + "username": "user", + "password": "pass" +} +async with AsyncWebCrawler(proxy_config=proxy_config) as crawler: + result = await crawler.arun(url="https://example.com") +``` + +## Anti-Detection Features + +Enable stealth features to avoid bot detection: + +```python +result = await crawler.arun( + url="https://example.com", + simulate_user=True, # Simulate human behavior + override_navigator=True, # Mask automation signals + magic=True # Enable all anti-detection features +) +``` + +## Handling Dynamic Content + +Configure browser to handle dynamic content: + +```python +# Wait for dynamic content +result = await crawler.arun( + url="https://example.com", + wait_for="js:() => 
document.querySelector('.content').children.length > 10", + process_iframes=True # Process iframe content +) + +# Handle lazy-loaded images +result = await crawler.arun( + url="https://example.com", + js_code="window.scrollTo(0, document.body.scrollHeight);", + delay_before_return_html=2.0 # Wait for images to load +) +``` + +## Comprehensive Example + +Here's how to combine various browser configurations: + +```python +async def crawl_with_advanced_config(url: str): + async with AsyncWebCrawler( + # Browser setup + browser_type="chromium", + headless=True, + verbose=True, + + # Identity + user_agent="Custom User Agent", + headers={"Accept-Language": "en-US"}, + + # Proxy setup + proxy="http://proxy.example.com:8080" + ) as crawler: + result = await crawler.arun( + url=url, + # Content handling + process_iframes=True, + screenshot=True, + + # Timing + page_timeout=60000, + delay_before_return_html=2.0, + + # Anti-detection + magic=True, + simulate_user=True, + + # Dynamic content + js_code=[ + "window.scrollTo(0, document.body.scrollHeight);", + "document.querySelector('.load-more')?.click();" + ], + wait_for="css:.dynamic-content" + ) + + return { + "content": result.markdown, + "screenshot": result.screenshot, + "success": result.success + } +``` \ No newline at end of file diff --git a/docs/md_v2/basic/cache-modes.md b/docs/md_v2/basic/cache-modes.md new file mode 100644 index 0000000000000000000000000000000000000000..73460e571b7989d375e5e88792990d1a61af742a --- /dev/null +++ b/docs/md_v2/basic/cache-modes.md @@ -0,0 +1,81 @@ +# Crawl4AI Cache System and Migration Guide + +## Overview +Starting from version 0.5.0, Crawl4AI introduces a new caching system that replaces the old boolean flags with a more intuitive `CacheMode` enum. This change simplifies cache control and makes the behavior more predictable. 
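+
+In practice the migration is usually a one-line change. A minimal sketch, using the same imports as the full example below:
+
+```python
+from crawl4ai import CacheMode
+from crawl4ai.async_configs import CrawlerRunConfig
+
+# Old: await crawler.arun(url=..., bypass_cache=True)
+# New: pass a CacheMode through the run configuration instead
+config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+```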
+ +## Old vs New Approach + +### Old Way (Deprecated) +The old system used multiple boolean flags: +- `bypass_cache`: Skip cache entirely +- `disable_cache`: Disable all caching +- `no_cache_read`: Don't read from cache +- `no_cache_write`: Don't write to cache + +### New Way (Recommended) +The new system uses a single `CacheMode` enum: +- `CacheMode.ENABLED`: Normal caching (read/write) +- `CacheMode.DISABLED`: No caching at all +- `CacheMode.READ_ONLY`: Only read from cache +- `CacheMode.WRITE_ONLY`: Only write to cache +- `CacheMode.BYPASS`: Skip cache for this operation + +## Migration Example + +### Old Code (Deprecated) +```python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def use_proxy(): + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", + bypass_cache=True # Old way + ) + print(len(result.markdown)) + +async def main(): + await use_proxy() + +if __name__ == "__main__": + asyncio.run(main()) +``` + +### New Code (Recommended) +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CacheMode +from crawl4ai.async_configs import CrawlerRunConfig + +async def use_proxy(): + config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) # Use CacheMode in CrawlerRunConfig + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", + config=config # Pass the configuration object + ) + print(len(result.markdown)) + +async def main(): + await use_proxy() + +if __name__ == "__main__": + asyncio.run(main()) +``` + +## Common Migration Patterns + +| Old Flag | New Mode | +|-----------------------|---------------------------------| +| `bypass_cache=True` | `cache_mode=CacheMode.BYPASS` | +| `disable_cache=True` | `cache_mode=CacheMode.DISABLED`| +| `no_cache_read=True` | `cache_mode=CacheMode.WRITE_ONLY` | +| `no_cache_write=True` | `cache_mode=CacheMode.READ_ONLY` | + +## Suppressing Deprecation Warnings +If you need time to migrate, you can temporarily suppress deprecation warnings: +```python +# In your config.py +SHOW_DEPRECATION_WARNINGS = False +``` diff --git a/docs/md_v2/basic/content-selection.md b/docs/md_v2/basic/content-selection.md new file mode 100644 index 0000000000000000000000000000000000000000..ec838f2d9870f5a1a68208284daf51b4a40622a9 --- /dev/null +++ b/docs/md_v2/basic/content-selection.md @@ -0,0 +1,135 @@ +### Content Selection + +Crawl4AI provides multiple ways to select and filter specific content from webpages. Learn how to precisely target the content you need. 
+ +#### CSS Selectors + +Extract specific content using a `CrawlerRunConfig` with CSS selectors: + +```python +from crawl4ai.async_configs import CrawlerRunConfig + +config = CrawlerRunConfig(css_selector=".main-article") # Target main article content +result = await crawler.arun(url="https://crawl4ai.com", config=config) + +config = CrawlerRunConfig(css_selector="article h1, article .content") # Target heading and content +result = await crawler.arun(url="https://crawl4ai.com", config=config) +``` + +#### Content Filtering + +Control content inclusion or exclusion with `CrawlerRunConfig`: + +```python +config = CrawlerRunConfig( + word_count_threshold=10, # Minimum words per block + excluded_tags=['form', 'header', 'footer', 'nav'], # Excluded tags + exclude_external_links=True, # Remove external links + exclude_social_media_links=True, # Remove social media links + exclude_external_images=True # Remove external images +) + +result = await crawler.arun(url="https://crawl4ai.com", config=config) +``` + +#### Iframe Content + +Process iframe content by enabling specific options in `CrawlerRunConfig`: + +```python +config = CrawlerRunConfig( + process_iframes=True, # Extract iframe content + remove_overlay_elements=True # Remove popups/modals that might block iframes +) + +result = await crawler.arun(url="https://crawl4ai.com", config=config) +``` + +#### Structured Content Selection Using LLMs + +Leverage LLMs for intelligent content extraction: + +```python +from crawl4ai.extraction_strategy import LLMExtractionStrategy +from pydantic import BaseModel +from typing import List + +class ArticleContent(BaseModel): + title: str + main_points: List[str] + conclusion: str + +strategy = LLMExtractionStrategy( + provider="ollama/nemotron", + schema=ArticleContent.schema(), + instruction="Extract the main article title, key points, and conclusion" +) + +config = CrawlerRunConfig(extraction_strategy=strategy) + +result = await crawler.arun(url="https://crawl4ai.com", config=config) +article = json.loads(result.extracted_content) +``` + +#### Pattern-Based Selection + +Extract content matching repetitive patterns: + +```python +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy + +schema = { + "name": "News Articles", + "baseSelector": "article.news-item", + "fields": [ + {"name": "headline", "selector": "h2", "type": "text"}, + {"name": "summary", "selector": ".summary", "type": "text"}, + {"name": "category", "selector": ".category", "type": "text"}, + { + "name": "metadata", + "type": "nested", + "fields": [ + {"name": "author", "selector": ".author", "type": "text"}, + {"name": "date", "selector": ".date", "type": "text"} + ] + } + ] +} + +strategy = JsonCssExtractionStrategy(schema) +config = CrawlerRunConfig(extraction_strategy=strategy) + +result = await crawler.arun(url="https://crawl4ai.com", config=config) +articles = json.loads(result.extracted_content) +``` + +#### Comprehensive Example + +Combine different selection methods using `CrawlerRunConfig`: + +```python +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig + +async def extract_article_content(url: str): + # Define structured extraction + article_schema = { + "name": "Article", + "baseSelector": "article.main", + "fields": [ + {"name": "title", "selector": "h1", "type": "text"}, + {"name": "content", "selector": ".content", "type": "text"} + ] + } + + # Define configuration + config = CrawlerRunConfig( + extraction_strategy=JsonCssExtractionStrategy(article_schema), + word_count_threshold=10, + 
excluded_tags=['nav', 'footer'], + exclude_external_links=True + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url=url, config=config) + return json.loads(result.extracted_content) +``` diff --git a/docs/md_v2/basic/content_filtering.md b/docs/md_v2/basic/content_filtering.md new file mode 100644 index 0000000000000000000000000000000000000000..14f48ec6bf51aa1ebe6f18068c2a518409bf766c --- /dev/null +++ b/docs/md_v2/basic/content_filtering.md @@ -0,0 +1,83 @@ +# Content Filtering in Crawl4AI + +This guide explains how to use content filtering strategies in Crawl4AI to extract the most relevant information from crawled web pages. You'll learn how to use the built-in `BM25ContentFilter` and how to create your own custom content filtering strategies. + +## Relevance Content Filter + +The `RelevanceContentFilter` is an abstract class providing a common interface for content filtering strategies. Specific algorithms, like `PruningContentFilter` or `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks. + +## Pruning Content Filter + +The `PruningContentFilter` removes less relevant nodes based on metrics like text density, link density, and tag importance. Nodes that fall below a defined threshold are pruned, leaving only high-value content. + +### Usage + +```python +from crawl4ai.async_configs import CrawlerRunConfig +from crawl4ai.content_filter_strategy import PruningContentFilter + +config = CrawlerRunConfig( + content_filter=PruningContentFilter( + min_word_threshold=5, + threshold_type='dynamic', + threshold=0.45 + ), + fit_markdown=True # Activates markdown fitting +) + +result = await crawler.arun(url="https://example.com", config=config) + +if result.success: + print(f"Cleaned Markdown:\n{result.fit_markdown}") +``` + +### Parameters + +- **`min_word_threshold`**: (Optional) Minimum number of words a node must contain to be considered relevant. Nodes with fewer words are automatically pruned. +- **`threshold_type`**: (Optional, default 'fixed') Controls how pruning thresholds are calculated: + - `'fixed'`: Uses a constant threshold value for all nodes. + - `'dynamic'`: Adjusts thresholds based on node properties (e.g., tag importance, text/link ratios). +- **`threshold`**: (Optional, default 0.48) Base threshold for pruning: + - Fixed: Nodes scoring below this value are removed. + - Dynamic: This value adjusts based on node characteristics. + +### How It Works + +The algorithm evaluates each node using: +- **Text density**: Ratio of text to overall content. +- **Link density**: Proportion of text within links. +- **Tag importance**: Weights based on HTML tag type (e.g., `
<article>`, `<p>`, `<div>
    `). +- **Content quality**: Metrics like text length and structural importance. + +## BM25 Algorithm + +The `BM25ContentFilter` uses the BM25 algorithm to rank and extract text chunks based on relevance to a search query or page metadata. + +### Usage + +```python +from crawl4ai.async_configs import CrawlerRunConfig +from crawl4ai.content_filter_strategy import BM25ContentFilter + +config = CrawlerRunConfig( + content_filter=BM25ContentFilter(user_query="fruit nutrition health"), + fit_markdown=True # Activates markdown fitting +) + +result = await crawler.arun(url="https://example.com", config=config) + +if result.success: + print(f"Filtered Content:\n{result.extracted_content}") + print(f"\nFiltered Markdown:\n{result.fit_markdown}") + print(f"\nFiltered HTML:\n{result.fit_html}") +else: + print("Error:", result.error_message) +``` + +### Parameters + +- **`user_query`**: (Optional) A string representing the search query. If not provided, the filter extracts metadata (title, description, keywords) and uses it as the query. +- **`bm25_threshold`**: (Optional, default 1.0) Threshold controlling relevance: + - Higher values return stricter, more relevant results. + - Lower values include more lenient filtering. + diff --git a/docs/md_v2/basic/docker-deploymeny.md b/docs/md_v2/basic/docker-deploymeny.md new file mode 100644 index 0000000000000000000000000000000000000000..31d33e8ce82f349a938f7c94b9860a35d6351574 --- /dev/null +++ b/docs/md_v2/basic/docker-deploymeny.md @@ -0,0 +1,702 @@ +# Docker Deployment + +Crawl4AI provides official Docker images for easy deployment and scalability. This guide covers installation, configuration, and usage of Crawl4AI in Docker environments. + +## Quick Start 🚀 + +Pull and run the basic version: + +```bash +# Basic run without security +docker pull unclecode/crawl4ai:basic +docker run -p 11235:11235 unclecode/crawl4ai:basic + +# Run with API security enabled +docker run -p 11235:11235 -e CRAWL4AI_API_TOKEN=your_secret_token unclecode/crawl4ai:basic +``` + +## Running with Docker Compose 🐳 + +### Use Docker Compose (From Local Dockerfile or Docker Hub) + +Crawl4AI provides flexibility to use Docker Compose for managing your containerized services. You can either build the image locally from the provided `Dockerfile` or use the pre-built image from Docker Hub. + +### **Option 1: Using Docker Compose to Build Locally** +If you want to build the image locally, use the provided `docker-compose.local.yml` file. + +```bash +docker-compose -f docker-compose.local.yml up -d +``` + +This will: +1. Build the Docker image from the provided `Dockerfile`. +2. Start the container and expose it on `http://localhost:11235`. + +--- + +### **Option 2: Using Docker Compose with Pre-Built Image from Hub** +If you prefer using the pre-built image on Docker Hub, use the `docker-compose.hub.yml` file. + +```bash +docker-compose -f docker-compose.hub.yml up -d +``` + +This will: +1. Pull the pre-built image `unclecode/crawl4ai:basic` (or `all`, depending on your configuration). +2. Start the container and expose it on `http://localhost:11235`. 
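+
+Whichever option you choose, you can confirm the service is reachable with the same health endpoint used in the testing section later in this guide (a quick check, assuming the default `11235` port mapping):
+
+```python
+import requests
+
+# Quick liveness check against the default port mapping
+print(requests.get("http://localhost:11235/health").json())
+```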
+ +--- + +### **Stopping the Running Services** + +To stop the services started via Docker Compose, you can use: + +```bash +docker-compose -f docker-compose.local.yml down +# OR +docker-compose -f docker-compose.hub.yml down +``` + +If the containers don’t stop and the application is still running, check the running containers: + +```bash +docker ps +``` + +Find the `CONTAINER ID` of the running service and stop it forcefully: + +```bash +docker stop +``` + +--- + +### **Debugging with Docker Compose** + +- **Check Logs**: To view the container logs: + ```bash + docker-compose -f docker-compose.local.yml logs -f + ``` + +- **Remove Orphaned Containers**: If the service is still running unexpectedly: + ```bash + docker-compose -f docker-compose.local.yml down --remove-orphans + ``` + +- **Manually Remove Network**: If the network is still in use: + ```bash + docker network ls + docker network rm crawl4ai_default + ``` + +--- + +### Why Use Docker Compose? + +Docker Compose is the recommended way to deploy Crawl4AI because: +1. It simplifies multi-container setups. +2. Allows you to define environment variables, resources, and ports in a single file. +3. Makes it easier to switch between local development and production-ready images. + +For example, your `docker-compose.yml` could include API keys, token settings, and memory limits, making deployment quick and consistent. + + + + +## API Security 🔒 + +### Understanding CRAWL4AI_API_TOKEN + +The `CRAWL4AI_API_TOKEN` provides optional security for your Crawl4AI instance: + +- If `CRAWL4AI_API_TOKEN` is set: All API endpoints (except `/health`) require authentication +- If `CRAWL4AI_API_TOKEN` is not set: The API is publicly accessible + +```bash +# Secured Instance +docker run -p 11235:11235 -e CRAWL4AI_API_TOKEN=your_secret_token unclecode/crawl4ai:all + +# Unsecured Instance +docker run -p 11235:11235 unclecode/crawl4ai:all +``` + +### Making API Calls + +For secured instances, include the token in all requests: + +```python +import requests + +# Setup headers if token is being used +api_token = "your_secret_token" # Same token set in CRAWL4AI_API_TOKEN +headers = {"Authorization": f"Bearer {api_token}"} if api_token else {} + +# Making authenticated requests +response = requests.post( + "http://localhost:11235/crawl", + headers=headers, + json={ + "urls": "https://example.com", + "priority": 10 + } +) + +# Checking task status +task_id = response.json()["task_id"] +status = requests.get( + f"http://localhost:11235/task/{task_id}", + headers=headers +) +``` + +### Using with Docker Compose + +In your `docker-compose.yml`: +```yaml +services: + crawl4ai: + image: unclecode/crawl4ai:all + environment: + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional + # ... other configuration +``` + +Then either: +1. Set in `.env` file: +```env +CRAWL4AI_API_TOKEN=your_secret_token +``` + +2. Or set via command line: +```bash +CRAWL4AI_API_TOKEN=your_secret_token docker-compose up +``` + +> **Security Note**: If you enable the API token, make sure to keep it secure and never commit it to version control. The token will be required for all API endpoints except the health check endpoint (`/health`). 
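The same authenticated requests also work from the command line. Here is a `curl` sketch using the endpoints and Bearer-token scheme shown above (replace `your_secret_token` and the task ID with your own values):

```bash
# Submit a crawl job on a secured instance
curl -X POST http://localhost:11235/crawl \
  -H "Authorization: Bearer your_secret_token" \
  -H "Content-Type: application/json" \
  -d '{"urls": "https://example.com", "priority": 10}'

# Poll the task status (substitute the task_id returned above)
curl -H "Authorization: Bearer your_secret_token" \
  http://localhost:11235/task/<task_id>
```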
+ +## Configuration Options 🔧 + +### Environment Variables + +You can configure the service using environment variables: + +```bash +# Basic configuration +docker run -p 11235:11235 \ + -e MAX_CONCURRENT_TASKS=5 \ + unclecode/crawl4ai:all + +# With security and LLM support +docker run -p 11235:11235 \ + -e CRAWL4AI_API_TOKEN=your_secret_token \ + -e OPENAI_API_KEY=sk-... \ + -e ANTHROPIC_API_KEY=sk-ant-... \ + unclecode/crawl4ai:all +``` + +### Using Docker Compose (Recommended) 🐳 + +Create a `docker-compose.yml`: + +```yaml +version: '3.8' + +services: + crawl4ai: + image: unclecode/crawl4ai:all + ports: + - "11235:11235" + environment: + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API security + - MAX_CONCURRENT_TASKS=5 + # LLM Provider Keys + - OPENAI_API_KEY=${OPENAI_API_KEY:-} + - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-} + volumes: + - /dev/shm:/dev/shm + deploy: + resources: + limits: + memory: 4G + reservations: + memory: 1G +``` + +You can run it in two ways: + +1. Using environment variables directly: +```bash +CRAWL4AI_API_TOKEN=secret123 OPENAI_API_KEY=sk-... docker-compose up +``` + +2. Using a `.env` file (recommended): +Create a `.env` file in the same directory: +```env +# API Security (optional) +CRAWL4AI_API_TOKEN=your_secret_token + +# LLM Provider Keys +OPENAI_API_KEY=sk-... +ANTHROPIC_API_KEY=sk-ant-... + +# Other Configuration +MAX_CONCURRENT_TASKS=5 +``` + +Then simply run: +```bash +docker-compose up +``` + +### Testing the Deployment 🧪 + +```python +import requests + +# For unsecured instances +def test_unsecured(): + # Health check + health = requests.get("http://localhost:11235/health") + print("Health check:", health.json()) + + # Basic crawl + response = requests.post( + "http://localhost:11235/crawl", + json={ + "urls": "https://www.nbcnews.com/business", + "priority": 10 + } + ) + task_id = response.json()["task_id"] + print("Task ID:", task_id) + +# For secured instances +def test_secured(api_token): + headers = {"Authorization": f"Bearer {api_token}"} + + # Basic crawl with authentication + response = requests.post( + "http://localhost:11235/crawl", + headers=headers, + json={ + "urls": "https://www.nbcnews.com/business", + "priority": 10 + } + ) + task_id = response.json()["task_id"] + print("Task ID:", task_id) +``` + +### LLM Extraction Example 🤖 + +When you've configured your LLM provider keys (via environment variables or `.env`), you can use LLM extraction: + +```python +request = { + "urls": "https://example.com", + "extraction_config": { + "type": "llm", + "params": { + "provider": "openai/gpt-4", + "instruction": "Extract main topics from the page" + } + } +} + +# Make the request (add headers if using API security) +response = requests.post("http://localhost:11235/crawl", json=request) +``` + +> **Note**: Remember to add `.env` to your `.gitignore` to keep your API keys secure! 
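If you run the container directly instead of through Compose, the same `.env` file can be supplied to `docker run` via Docker's `--env-file` flag, which keeps keys out of your shell history:

```bash
# Reuse the .env file from above with plain docker run
docker run -p 11235:11235 --env-file .env unclecode/crawl4ai:all
```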
+ + +## Usage Examples 📝 + +### Basic Crawling + +```python +request = { + "urls": "https://www.nbcnews.com/business", + "priority": 10 +} + +response = requests.post("http://localhost:11235/crawl", json=request) +task_id = response.json()["task_id"] + +# Get results +result = requests.get(f"http://localhost:11235/task/{task_id}") +``` + +### Structured Data Extraction + +```python +schema = { + "name": "Crypto Prices", + "baseSelector": ".cds-tableRow-t45thuk", + "fields": [ + { + "name": "crypto", + "selector": "td:nth-child(1) h2", + "type": "text", + }, + { + "name": "price", + "selector": "td:nth-child(2)", + "type": "text", + } + ], +} + +request = { + "urls": "https://www.coinbase.com/explore", + "extraction_config": { + "type": "json_css", + "params": {"schema": schema} + } +} +``` + +### Dynamic Content Handling + +```python +request = { + "urls": "https://www.nbcnews.com/business", + "js_code": [ + "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();" + ], + "wait_for": "article.tease-card:nth-child(10)" +} +``` + +### AI-Powered Extraction (Full Version) + +```python +request = { + "urls": "https://www.nbcnews.com/business", + "extraction_config": { + "type": "cosine", + "params": { + "semantic_filter": "business finance economy", + "word_count_threshold": 10, + "max_dist": 0.2, + "top_k": 3 + } + } +} +``` + +## Platform-Specific Instructions 💻 + +### macOS +```bash +docker pull unclecode/crawl4ai:basic +docker run -p 11235:11235 unclecode/crawl4ai:basic +``` + +### Ubuntu +```bash +# Basic version +docker pull unclecode/crawl4ai:basic +docker run -p 11235:11235 unclecode/crawl4ai:basic + +# With GPU support +docker pull unclecode/crawl4ai:gpu +docker run --gpus all -p 11235:11235 unclecode/crawl4ai:gpu +``` + +### Windows (PowerShell) +```powershell +docker pull unclecode/crawl4ai:basic +docker run -p 11235:11235 unclecode/crawl4ai:basic +``` + +## Testing 🧪 + +Save this as `test_docker.py`: + +```python +import requests +import json +import time +import sys + +class Crawl4AiTester: + def __init__(self, base_url: str = "http://localhost:11235"): + self.base_url = base_url + + def submit_and_wait(self, request_data: dict, timeout: int = 300) -> dict: + # Submit crawl job + response = requests.post(f"{self.base_url}/crawl", json=request_data) + task_id = response.json()["task_id"] + print(f"Task ID: {task_id}") + + # Poll for result + start_time = time.time() + while True: + if time.time() - start_time > timeout: + raise TimeoutError(f"Task {task_id} timeout") + + result = requests.get(f"{self.base_url}/task/{task_id}") + status = result.json() + + if status["status"] == "completed": + return status + + time.sleep(2) + +def test_deployment(): + tester = Crawl4AiTester() + + # Test basic crawl + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 10 + } + + result = tester.submit_and_wait(request) + print("Basic crawl successful!") + print(f"Content length: {len(result['result']['markdown'])}") + +if __name__ == "__main__": + test_deployment() +``` + +## Advanced Configuration ⚙️ + +### Crawler Parameters + +The `crawler_params` field allows you to configure the browser instance and crawling behavior. 
Here are key parameters you can use: + +```python +request = { + "urls": "https://example.com", + "crawler_params": { + # Browser Configuration + "headless": True, # Run in headless mode + "browser_type": "chromium", # chromium/firefox/webkit + "user_agent": "custom-agent", # Custom user agent + "proxy": "http://proxy:8080", # Proxy configuration + + # Performance & Behavior + "page_timeout": 30000, # Page load timeout (ms) + "verbose": True, # Enable detailed logging + "semaphore_count": 5, # Concurrent request limit + + # Anti-Detection Features + "simulate_user": True, # Simulate human behavior + "magic": True, # Advanced anti-detection + "override_navigator": True, # Override navigator properties + + # Session Management + "user_data_dir": "./browser-data", # Browser profile location + "use_managed_browser": True, # Use persistent browser + } +} +``` + +### Extra Parameters + +The `extra` field allows passing additional parameters directly to the crawler's `arun` function: + +```python +request = { + "urls": "https://example.com", + "extra": { + "word_count_threshold": 10, # Min words per block + "only_text": True, # Extract only text + "bypass_cache": True, # Force fresh crawl + "process_iframes": True, # Include iframe content + } +} +``` + +### Complete Examples + +1. **Advanced News Crawling** +```python +request = { + "urls": "https://www.nbcnews.com/business", + "crawler_params": { + "headless": True, + "page_timeout": 30000, + "remove_overlay_elements": True # Remove popups + }, + "extra": { + "word_count_threshold": 50, # Longer content blocks + "bypass_cache": True # Fresh content + }, + "css_selector": ".article-body" +} +``` + +2. **Anti-Detection Configuration** +```python +request = { + "urls": "https://example.com", + "crawler_params": { + "simulate_user": True, + "magic": True, + "override_navigator": True, + "user_agent": "Mozilla/5.0 ...", + "headers": { + "Accept-Language": "en-US,en;q=0.9" + } + } +} +``` + +3. **LLM Extraction with Custom Parameters** +```python +request = { + "urls": "https://openai.com/pricing", + "extraction_config": { + "type": "llm", + "params": { + "provider": "openai/gpt-4", + "schema": pricing_schema + } + }, + "crawler_params": { + "verbose": True, + "page_timeout": 60000 + }, + "extra": { + "word_count_threshold": 1, + "only_text": True + } +} +``` + +4. **Session-Based Dynamic Content** +```python +request = { + "urls": "https://example.com", + "crawler_params": { + "session_id": "dynamic_session", + "headless": False, + "page_timeout": 60000 + }, + "js_code": ["window.scrollTo(0, document.body.scrollHeight);"], + "wait_for": "js:() => document.querySelectorAll('.item').length > 10", + "extra": { + "delay_before_return_html": 2.0 + } +} +``` + +5. 
**Screenshot with Custom Timing** +```python +request = { + "urls": "https://example.com", + "screenshot": True, + "crawler_params": { + "headless": True, + "screenshot_wait_for": ".main-content" + }, + "extra": { + "delay_before_return_html": 3.0 + } +} +``` + +### Parameter Reference Table + +| Category | Parameter | Type | Description | +|----------|-----------|------|-------------| +| Browser | headless | bool | Run browser in headless mode | +| Browser | browser_type | str | Browser engine selection | +| Browser | user_agent | str | Custom user agent string | +| Network | proxy | str | Proxy server URL | +| Network | headers | dict | Custom HTTP headers | +| Timing | page_timeout | int | Page load timeout (ms) | +| Timing | delay_before_return_html | float | Wait before capture | +| Anti-Detection | simulate_user | bool | Human behavior simulation | +| Anti-Detection | magic | bool | Advanced protection | +| Session | session_id | str | Browser session ID | +| Session | user_data_dir | str | Profile directory | +| Content | word_count_threshold | int | Minimum words per block | +| Content | only_text | bool | Text-only extraction | +| Content | process_iframes | bool | Include iframe content | +| Debug | verbose | bool | Detailed logging | +| Debug | log_console | bool | Browser console logs | + +## Troubleshooting 🔍 + +### Common Issues + +1. **Connection Refused** + ``` + Error: Connection refused at localhost:11235 + ``` + Solution: Ensure the container is running and ports are properly mapped. + +2. **Resource Limits** + ``` + Error: No available slots + ``` + Solution: Increase MAX_CONCURRENT_TASKS or container resources. + +3. **GPU Access** + ``` + Error: GPU not found + ``` + Solution: Ensure proper NVIDIA drivers and use `--gpus all` flag. + +### Debug Mode + +Access container for debugging: +```bash +docker run -it --entrypoint /bin/bash unclecode/crawl4ai:all +``` + +View container logs: +```bash +docker logs [container_id] +``` + +## Best Practices 🌟 + +1. **Resource Management** + - Set appropriate memory and CPU limits + - Monitor resource usage via health endpoint + - Use basic version for simple crawling tasks + +2. **Scaling** + - Use multiple containers for high load + - Implement proper load balancing + - Monitor performance metrics + +3. **Security** + - Use environment variables for sensitive data + - Implement proper network isolation + - Regular security updates + +## API Reference 📚 + +### Health Check +```http +GET /health +``` + +### Submit Crawl Task +```http +POST /crawl +Content-Type: application/json + +{ + "urls": "string or array", + "extraction_config": { + "type": "basic|llm|cosine|json_css", + "params": {} + }, + "priority": 1-10, + "ttl": 3600 +} +``` + +### Get Task Status +```http +GET /task/{task_id} +``` + +For more details, visit the [official documentation](https://crawl4ai.com/mkdocs/). \ No newline at end of file diff --git a/docs/md_v2/basic/file-download.md b/docs/md_v2/basic/file-download.md new file mode 100644 index 0000000000000000000000000000000000000000..eac0f5cb6534f91a737d10cff9004b989c7ba75c --- /dev/null +++ b/docs/md_v2/basic/file-download.md @@ -0,0 +1,129 @@ +# Download Handling in Crawl4AI + +This guide explains how to use Crawl4AI to handle file downloads during crawling. You'll learn how to trigger downloads, specify download locations, and access downloaded files. + +## Enabling Downloads + +To enable downloads, set the `accept_downloads` parameter in the `BrowserConfig` object and pass it to the crawler. 
+ +```python +from crawl4ai.async_configs import BrowserConfig, AsyncWebCrawler + +async def main(): + config = BrowserConfig(accept_downloads=True) # Enable downloads globally + async with AsyncWebCrawler(config=config) as crawler: + # ... your crawling logic ... + +asyncio.run(main()) +``` + +Or, enable it for a specific crawl by using `CrawlerRunConfig`: + +```python +from crawl4ai.async_configs import CrawlerRunConfig + +async def main(): + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig(accept_downloads=True) + result = await crawler.arun(url="https://example.com", config=config) + # ... +``` + +## Specifying Download Location + +Specify the download directory using the `downloads_path` attribute in the `BrowserConfig` object. If not provided, Crawl4AI defaults to creating a "downloads" directory inside the `.crawl4ai` folder in your home directory. + +```python +from crawl4ai.async_configs import BrowserConfig +import os + +downloads_path = os.path.join(os.getcwd(), "my_downloads") # Custom download path +os.makedirs(downloads_path, exist_ok=True) + +config = BrowserConfig(accept_downloads=True, downloads_path=downloads_path) + +async def main(): + async with AsyncWebCrawler(config=config) as crawler: + result = await crawler.arun(url="https://example.com") + # ... +``` + +## Triggering Downloads + +Downloads are typically triggered by user interactions on a web page, such as clicking a download button. Use `js_code` in `CrawlerRunConfig` to simulate these actions and `wait_for` to allow sufficient time for downloads to start. + +```python +from crawl4ai.async_configs import CrawlerRunConfig + +config = CrawlerRunConfig( + js_code=""" + const downloadLink = document.querySelector('a[href$=".exe"]'); + if (downloadLink) { + downloadLink.click(); + } + """, + wait_for=5 # Wait 5 seconds for the download to start +) + +result = await crawler.arun(url="https://www.python.org/downloads/", config=config) +``` + +## Accessing Downloaded Files + +The `downloaded_files` attribute of the `CrawlResult` object contains paths to downloaded files. 
+ +```python +if result.downloaded_files: + print("Downloaded files:") + for file_path in result.downloaded_files: + print(f"- {file_path}") + file_size = os.path.getsize(file_path) + print(f"- File size: {file_size} bytes") +else: + print("No files downloaded.") +``` + +## Example: Downloading Multiple Files + +```python +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig +import os +from pathlib import Path + +async def download_multiple_files(url: str, download_path: str): + config = BrowserConfig(accept_downloads=True, downloads_path=download_path) + async with AsyncWebCrawler(config=config) as crawler: + run_config = CrawlerRunConfig( + js_code=""" + const downloadLinks = document.querySelectorAll('a[download]'); + for (const link of downloadLinks) { + link.click(); + await new Promise(r => setTimeout(r, 2000)); // Delay between clicks + } + """, + wait_for=10 # Wait for all downloads to start + ) + result = await crawler.arun(url=url, config=run_config) + + if result.downloaded_files: + print("Downloaded files:") + for file in result.downloaded_files: + print(f"- {file}") + else: + print("No files downloaded.") + +# Usage +download_path = os.path.join(Path.home(), ".crawl4ai", "downloads") +os.makedirs(download_path, exist_ok=True) + +asyncio.run(download_multiple_files("https://www.python.org/downloads/windows/", download_path)) +``` + +## Important Considerations + +- **Browser Context:** Downloads are managed within the browser context. Ensure `js_code` correctly targets the download triggers on the webpage. +- **Timing:** Use `wait_for` in `CrawlerRunConfig` to manage download timing. +- **Error Handling:** Handle errors to manage failed downloads or incorrect paths gracefully. +- **Security:** Scan downloaded files for potential security threats before use. + +This revised guide ensures consistency with the `Crawl4AI` codebase by using `BrowserConfig` and `CrawlerRunConfig` for all download-related configurations. Let me know if further adjustments are needed! \ No newline at end of file diff --git a/docs/md_v2/basic/installation.md b/docs/md_v2/basic/installation.md new file mode 100644 index 0000000000000000000000000000000000000000..de8aeafa501af77ca78da5e47017e32a771bd2e7 --- /dev/null +++ b/docs/md_v2/basic/installation.md @@ -0,0 +1,137 @@ +# Installation 💻 + +Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package, use it with Docker, or run it as a local server. + +## Option 1: Python Package Installation (Recommended) + +Crawl4AI is now available on PyPI, making installation easier than ever. 
Choose the option that best fits your needs: + +### Basic Installation + +For basic web crawling and scraping tasks: + +```bash +pip install crawl4ai +playwright install # Install Playwright dependencies +``` + +### Installation with PyTorch + +For advanced text clustering (includes CosineSimilarity cluster strategy): + +```bash +pip install crawl4ai[torch] +``` + +### Installation with Transformers + +For text summarization and Hugging Face models: + +```bash +pip install crawl4ai[transformer] +``` + +### Full Installation + +For all features: + +```bash +pip install crawl4ai[all] +``` + +### Development Installation + +For contributors who plan to modify the source code: + +```bash +git clone https://github.com/unclecode/crawl4ai.git +cd crawl4ai +pip install -e ".[all]" +playwright install # Install Playwright dependencies +``` + +💡 After installation with "torch", "transformer", or "all" options, it's recommended to run the following CLI command to load the required models: + +```bash +crawl4ai-download-models +``` + +This is optional but will boost the performance and speed of the crawler. You only need to do this once after installation. + +## Playwright Installation Note for Ubuntu + +If you encounter issues with Playwright installation on Ubuntu, you may need to install additional dependencies: + +```bash +sudo apt-get install -y \ + libwoff1 \ + libopus0 \ + libwebp7 \ + libwebpdemux2 \ + libenchant-2-2 \ + libgudev-1.0-0 \ + libsecret-1-0 \ + libhyphen0 \ + libgdk-pixbuf2.0-0 \ + libegl1 \ + libnotify4 \ + libxslt1.1 \ + libevent-2.1-7 \ + libgles2 \ + libxcomposite1 \ + libatk1.0-0 \ + libatk-bridge2.0-0 \ + libepoxy0 \ + libgtk-3-0 \ + libharfbuzz-icu0 \ + libgstreamer-gl1.0-0 \ + libgstreamer-plugins-bad1.0-0 \ + gstreamer1.0-plugins-good \ + gstreamer1.0-plugins-bad \ + libxt6 \ + libxaw7 \ + xvfb \ + fonts-noto-color-emoji \ + libfontconfig \ + libfreetype6 \ + xfonts-cyrillic \ + xfonts-scalable \ + fonts-liberation \ + fonts-ipafont-gothic \ + fonts-wqy-zenhei \ + fonts-tlwg-loma-otf \ + fonts-freefont-ttf +``` + +## Option 2: Using Docker (Coming Soon) + +Docker support for Crawl4AI is currently in progress and will be available soon. This will allow you to run Crawl4AI in a containerized environment, ensuring consistency across different systems. + +## Option 3: Local Server Installation + +For those who prefer to run Crawl4AI as a local server, instructions will be provided once the Docker implementation is complete. + +## Verifying Your Installation + +After installation, you can verify that Crawl4AI is working correctly by running a simple Python script: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def main(): + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun(url="https://www.example.com") + print(result.markdown[:500]) # Print first 500 characters + +if __name__ == "__main__": + asyncio.run(main()) +``` + +This script should successfully crawl the example website and print the first 500 characters of the extracted content. + +## Getting Help + +If you encounter any issues during installation or usage, please check the [documentation](https://crawl4ai.com/mkdocs/) or raise an issue on the [GitHub repository](https://github.com/unclecode/crawl4ai/issues). + +Happy crawling! 
🕷️🤖 \ No newline at end of file diff --git a/docs/md_v2/basic/output-formats.md b/docs/md_v2/basic/output-formats.md new file mode 100644 index 0000000000000000000000000000000000000000..3686c23cacd59b0a3040aa3f0ea8adbfa1df32b4 --- /dev/null +++ b/docs/md_v2/basic/output-formats.md @@ -0,0 +1,102 @@ +# Output Formats + +Crawl4AI provides multiple output formats to suit different needs, ranging from raw HTML to structured data using LLM or pattern-based extraction, and versatile markdown outputs. + +## Basic Formats + +```python +result = await crawler.arun(url="https://example.com") + +# Access different formats +raw_html = result.html # Original HTML +clean_html = result.cleaned_html # Sanitized HTML +markdown_v2 = result.markdown_v2 # Detailed markdown generation results +fit_md = result.markdown_v2.fit_markdown # Most relevant content in markdown +``` + +> **Note**: The `markdown_v2` property will soon be replaced by `markdown`. It is recommended to start transitioning to using `markdown` for new implementations. + +## Raw HTML + +Original, unmodified HTML from the webpage. Useful when you need to: +- Preserve the exact page structure. +- Process HTML with your own tools. +- Debug page issues. + +```python +result = await crawler.arun(url="https://example.com") +print(result.html) # Complete HTML including headers, scripts, etc. +``` + +## Cleaned HTML + +Sanitized HTML with unnecessary elements removed. Automatically: +- Removes scripts and styles. +- Cleans up formatting. +- Preserves semantic structure. + +```python +config = CrawlerRunConfig( + excluded_tags=['form', 'header', 'footer'], # Additional tags to remove + keep_data_attributes=False # Remove data-* attributes +) +result = await crawler.arun(url="https://example.com", config=config) +print(result.cleaned_html) +``` + +## Standard Markdown + +HTML converted to clean markdown format. This output is useful for: +- Content analysis. +- Documentation. +- Readability. + +```python +config = CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator( + options={"include_links": True} # Include links in markdown + ) +) +result = await crawler.arun(url="https://example.com", config=config) +print(result.markdown_v2.raw_markdown) # Standard markdown with links +``` + +## Fit Markdown + +Extract and convert only the most relevant content into markdown format. Best suited for: +- Article extraction. +- Focusing on the main content. +- Removing boilerplate. + +To generate `fit_markdown`, use a content filter like `PruningContentFilter`: + +```python +from crawl4ai.content_filter_strategy import PruningContentFilter + +config = CrawlerRunConfig( + content_filter=PruningContentFilter( + threshold=0.7, + threshold_type="dynamic", + min_word_threshold=100 + ) +) +result = await crawler.arun(url="https://example.com", config=config) +print(result.markdown_v2.fit_markdown) # Extracted main content in markdown +``` + +## Markdown with Citations + +Generate markdown that includes citations for links. This format is ideal for: +- Creating structured documentation. +- Including references for extracted content. 
+ +```python +config = CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator( + options={"citations": True} # Enable citations + ) +) +result = await crawler.arun(url="https://example.com", config=config) +print(result.markdown_v2.markdown_with_citations) +print(result.markdown_v2.references_markdown) # Citations section +``` diff --git a/docs/md_v2/basic/page-interaction.md b/docs/md_v2/basic/page-interaction.md new file mode 100644 index 0000000000000000000000000000000000000000..07a2c9cd6858385c36da633618a317209bd0903b --- /dev/null +++ b/docs/md_v2/basic/page-interaction.md @@ -0,0 +1,190 @@ +# Page Interaction + +Crawl4AI provides powerful features for interacting with dynamic webpages, handling JavaScript execution, and managing page events. + +## JavaScript Execution + +### Basic Execution + +```python +from crawl4ai.async_configs import CrawlerRunConfig + +# Single JavaScript command +config = CrawlerRunConfig( + js_code="window.scrollTo(0, document.body.scrollHeight);" +) +result = await crawler.arun(url="https://example.com", config=config) + +# Multiple commands +js_commands = [ + "window.scrollTo(0, document.body.scrollHeight);", + "document.querySelector('.load-more').click();", + "document.querySelector('#consent-button').click();" +] +config = CrawlerRunConfig(js_code=js_commands) +result = await crawler.arun(url="https://example.com", config=config) +``` + +## Wait Conditions + +### CSS-Based Waiting + +Wait for elements to appear: + +```python +config = CrawlerRunConfig(wait_for="css:.dynamic-content") # Wait for element with class 'dynamic-content' +result = await crawler.arun(url="https://example.com", config=config) +``` + +### JavaScript-Based Waiting + +Wait for custom conditions: + +```python +# Wait for number of elements +wait_condition = """() => { + return document.querySelectorAll('.item').length > 10; +}""" + +config = CrawlerRunConfig(wait_for=f"js:{wait_condition}") +result = await crawler.arun(url="https://example.com", config=config) + +# Wait for dynamic content to load +wait_for_content = """() => { + const content = document.querySelector('.content'); + return content && content.innerText.length > 100; +}""" + +config = CrawlerRunConfig(wait_for=f"js:{wait_for_content}") +result = await crawler.arun(url="https://example.com", config=config) +``` + +## Handling Dynamic Content + +### Load More Content + +Handle infinite scroll or load more buttons: + +```python +config = CrawlerRunConfig( + js_code=[ + "window.scrollTo(0, document.body.scrollHeight);", # Scroll to bottom + "const loadMore = document.querySelector('.load-more'); if(loadMore) loadMore.click();" # Click load more + ], + wait_for="js:() => document.querySelectorAll('.item').length > previousCount" # Wait for new content +) +result = await crawler.arun(url="https://example.com", config=config) +``` + +### Form Interaction + +Handle forms and inputs: + +```python +js_form_interaction = """ + document.querySelector('#search').value = 'search term'; // Fill form fields + document.querySelector('form').submit(); // Submit form +""" + +config = CrawlerRunConfig( + js_code=js_form_interaction, + wait_for="css:.results" # Wait for results to load +) +result = await crawler.arun(url="https://example.com", config=config) +``` + +## Timing Control + +### Delays and Timeouts + +Control timing of interactions: + +```python +config = CrawlerRunConfig( + page_timeout=60000, # Page load timeout (ms) + delay_before_return_html=2.0 # Wait before capturing content +) +result = await 
crawler.arun(url="https://example.com", config=config) +``` + +## Complex Interactions Example + +Here's an example of handling a dynamic page with multiple interactions: + +```python +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig + +async def crawl_dynamic_content(): + async with AsyncWebCrawler() as crawler: + # Initial page load + config = CrawlerRunConfig( + js_code="document.querySelector('.cookie-accept')?.click();", # Handle cookie consent + wait_for="css:.main-content" + ) + result = await crawler.arun(url="https://example.com", config=config) + + # Load more content + session_id = "dynamic_session" # Keep session for multiple interactions + + for page in range(3): # Load 3 pages of content + config = CrawlerRunConfig( + session_id=session_id, + js_code=[ + "window.scrollTo(0, document.body.scrollHeight);", # Scroll to bottom + "window.previousCount = document.querySelectorAll('.item').length;", # Store item count + "document.querySelector('.load-more')?.click();" # Click load more + ], + wait_for="""() => { + const currentCount = document.querySelectorAll('.item').length; + return currentCount > window.previousCount; + }""", + js_only=(page > 0) # Execute JS without reloading page for subsequent interactions + ) + result = await crawler.arun(url="https://example.com", config=config) + print(f"Page {page + 1} items:", len(result.cleaned_html)) + + # Clean up session + await crawler.crawler_strategy.kill_session(session_id) +``` + +## Using with Extraction Strategies + +Combine page interaction with structured extraction: + +```python +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy +from crawl4ai.async_configs import CrawlerRunConfig + +# Pattern-based extraction after interaction +schema = { + "name": "Dynamic Items", + "baseSelector": ".item", + "fields": [ + {"name": "title", "selector": "h2", "type": "text"}, + {"name": "description", "selector": ".desc", "type": "text"} + ] +} + +config = CrawlerRunConfig( + js_code="window.scrollTo(0, document.body.scrollHeight);", + wait_for="css:.item:nth-child(10)", # Wait for 10 items + extraction_strategy=JsonCssExtractionStrategy(schema) +) +result = await crawler.arun(url="https://example.com", config=config) + +# Or use LLM to analyze dynamic content +class ContentAnalysis(BaseModel): + topics: List[str] + summary: str + +config = CrawlerRunConfig( + js_code="document.querySelector('.show-more').click();", + wait_for="css:.full-content", + extraction_strategy=LLMExtractionStrategy( + provider="ollama/nemotron", + schema=ContentAnalysis.schema(), + instruction="Analyze the full content" + ) +) +result = await crawler.arun(url="https://example.com", config=config) +``` diff --git a/docs/md_v2/basic/prefix-based-input.md b/docs/md_v2/basic/prefix-based-input.md new file mode 100644 index 0000000000000000000000000000000000000000..6dfae9d45eeed59748fcc8c62844f55eba70ab75 --- /dev/null +++ b/docs/md_v2/basic/prefix-based-input.md @@ -0,0 +1,158 @@ +# Prefix-Based Input Handling in Crawl4AI + +This guide will walk you through using the Crawl4AI library to crawl web pages, local HTML files, and raw HTML strings. We'll demonstrate these capabilities using a Wikipedia page as an example. 
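Before walking through each input type, here is a small helper sketch showing how the three prefix forms covered below can be produced from a single entry point. The `to_crawl_url` function is purely illustrative and not part of the library:

```python
from pathlib import Path

def to_crawl_url(source: str) -> str:
    """Illustrative helper: map a web URL, a local file path, or a raw HTML
    string to the prefix-based `url` value accepted by `crawler.arun()`."""
    if source.startswith(("http://", "https://", "file://", "raw:")):
        return source  # already in a supported form
    if Path(source).exists():
        return f"file://{Path(source).resolve()}"  # local HTML file
    return f"raw:{source}"  # treat anything else as raw HTML content
```

Each of these forms is explained in detail in the sections that follow.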
+ +## Crawling a Web URL + +To crawl a live web page, provide the URL starting with `http://` or `https://`, using a `CrawlerRunConfig` object: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler +from crawl4ai.async_configs import CrawlerRunConfig + +async def crawl_web(): + config = CrawlerRunConfig(bypass_cache=True) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://en.wikipedia.org/wiki/apple", config=config) + if result.success: + print("Markdown Content:") + print(result.markdown) + else: + print(f"Failed to crawl: {result.error_message}") + +asyncio.run(crawl_web()) +``` + +## Crawling a Local HTML File + +To crawl a local HTML file, prefix the file path with `file://`. + +```python +import asyncio +from crawl4ai import AsyncWebCrawler +from crawl4ai.async_configs import CrawlerRunConfig + +async def crawl_local_file(): + local_file_path = "/path/to/apple.html" # Replace with your file path + file_url = f"file://{local_file_path}" + config = CrawlerRunConfig(bypass_cache=True) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url=file_url, config=config) + if result.success: + print("Markdown Content from Local File:") + print(result.markdown) + else: + print(f"Failed to crawl local file: {result.error_message}") + +asyncio.run(crawl_local_file()) +``` + +## Crawling Raw HTML Content + +To crawl raw HTML content, prefix the HTML string with `raw:`. + +```python +import asyncio +from crawl4ai import AsyncWebCrawler +from crawl4ai.async_configs import CrawlerRunConfig + +async def crawl_raw_html(): + raw_html = "
<html><body><h1>Hello, World!</h1></body></html>
    " + raw_html_url = f"raw:{raw_html}" + config = CrawlerRunConfig(bypass_cache=True) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url=raw_html_url, config=config) + if result.success: + print("Markdown Content from Raw HTML:") + print(result.markdown) + else: + print(f"Failed to crawl raw HTML: {result.error_message}") + +asyncio.run(crawl_raw_html()) +``` + +--- + +# Complete Example + +Below is a comprehensive script that: + +1. Crawls the Wikipedia page for "Apple." +2. Saves the HTML content to a local file (`apple.html`). +3. Crawls the local HTML file and verifies the markdown length matches the original crawl. +4. Crawls the raw HTML content from the saved file and verifies consistency. + +```python +import os +import sys +import asyncio +from pathlib import Path +from crawl4ai import AsyncWebCrawler +from crawl4ai.async_configs import CrawlerRunConfig + +async def main(): + wikipedia_url = "https://en.wikipedia.org/wiki/apple" + script_dir = Path(__file__).parent + html_file_path = script_dir / "apple.html" + + async with AsyncWebCrawler() as crawler: + # Step 1: Crawl the Web URL + print("\n=== Step 1: Crawling the Wikipedia URL ===") + web_config = CrawlerRunConfig(bypass_cache=True) + result = await crawler.arun(url=wikipedia_url, config=web_config) + + if not result.success: + print(f"Failed to crawl {wikipedia_url}: {result.error_message}") + return + + with open(html_file_path, 'w', encoding='utf-8') as f: + f.write(result.html) + web_crawl_length = len(result.markdown) + print(f"Length of markdown from web crawl: {web_crawl_length}\n") + + # Step 2: Crawl from the Local HTML File + print("=== Step 2: Crawling from the Local HTML File ===") + file_url = f"file://{html_file_path.resolve()}" + file_config = CrawlerRunConfig(bypass_cache=True) + local_result = await crawler.arun(url=file_url, config=file_config) + + if not local_result.success: + print(f"Failed to crawl local file {file_url}: {local_result.error_message}") + return + + local_crawl_length = len(local_result.markdown) + assert web_crawl_length == local_crawl_length, "Markdown length mismatch" + print("✅ Markdown length matches between web and local file crawl.\n") + + # Step 3: Crawl Using Raw HTML Content + print("=== Step 3: Crawling Using Raw HTML Content ===") + with open(html_file_path, 'r', encoding='utf-8') as f: + raw_html_content = f.read() + raw_html_url = f"raw:{raw_html_content}" + raw_config = CrawlerRunConfig(bypass_cache=True) + raw_result = await crawler.arun(url=raw_html_url, config=raw_config) + + if not raw_result.success: + print(f"Failed to crawl raw HTML content: {raw_result.error_message}") + return + + raw_crawl_length = len(raw_result.markdown) + assert web_crawl_length == raw_crawl_length, "Markdown length mismatch" + print("✅ Markdown length matches between web and raw HTML crawl.\n") + + print("All tests passed successfully!") + if html_file_path.exists(): + os.remove(html_file_path) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +--- + +# Conclusion + +With the unified `url` parameter and prefix-based handling in **Crawl4AI**, you can seamlessly handle web URLs, local HTML files, and raw HTML content. Use `CrawlerRunConfig` for flexible and consistent configuration in all scenarios. 
\ No newline at end of file diff --git a/docs/md_v2/basic/quickstart.md b/docs/md_v2/basic/quickstart.md new file mode 100644 index 0000000000000000000000000000000000000000..ffc35986cc36f7a5d3cb1b823d5eeca193085cbd --- /dev/null +++ b/docs/md_v2/basic/quickstart.md @@ -0,0 +1,172 @@ +# Quick Start Guide 🚀 + +Welcome to the Crawl4AI Quickstart Guide! In this tutorial, we'll walk you through the basic usage of Crawl4AI, covering everything from initial setup to advanced features like chunking and extraction strategies, using asynchronous programming. Let's dive in! 🌟 + +--- + +## Getting Started 🛠️ + +Set up your environment with `BrowserConfig` and create an `AsyncWebCrawler` instance. + +```python +import asyncio +from crawl4ai import AsyncWebCrawler +from crawl4ai.async_configs import BrowserConfig + +async def main(): + browser_config = BrowserConfig(verbose=True) + async with AsyncWebCrawler(config=browser_config) as crawler: + # Add your crawling logic here + pass + +if __name__ == "__main__": + asyncio.run(main()) +``` + +--- + +### Basic Usage + +Provide a URL and let Crawl4AI do the work! + +```python +from crawl4ai.async_configs import CrawlerRunConfig + +async def main(): + browser_config = BrowserConfig(verbose=True) + crawl_config = CrawlerRunConfig(url="https://www.nbcnews.com/business") + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(config=crawl_config) + print(f"Basic crawl result: {result.markdown[:500]}") # Print first 500 characters + +if __name__ == "__main__": + asyncio.run(main()) +``` + +--- + +### Taking Screenshots 📸 + +Capture and save webpage screenshots with `CrawlerRunConfig`: + +```python +from crawl4ai.async_configs import CacheMode + +async def capture_and_save_screenshot(url: str, output_path: str): + browser_config = BrowserConfig(verbose=True) + crawl_config = CrawlerRunConfig( + url=url, + screenshot=True, + cache_mode=CacheMode.BYPASS + ) + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(config=crawl_config) + + if result.success and result.screenshot: + import base64 + screenshot_data = base64.b64decode(result.screenshot) + with open(output_path, 'wb') as f: + f.write(screenshot_data) + print(f"Screenshot saved successfully to {output_path}") + else: + print("Failed to capture screenshot") +``` + +--- + +### Browser Selection 🌐 + +Choose from multiple browser engines using `BrowserConfig`: + +```python +from crawl4ai.async_configs import BrowserConfig + +# Use Firefox +firefox_config = BrowserConfig(browser_type="firefox", verbose=True, headless=True) +async with AsyncWebCrawler(config=firefox_config) as crawler: + result = await crawler.arun(config=CrawlerRunConfig(url="https://www.example.com")) + +# Use WebKit +webkit_config = BrowserConfig(browser_type="webkit", verbose=True, headless=True) +async with AsyncWebCrawler(config=webkit_config) as crawler: + result = await crawler.arun(config=CrawlerRunConfig(url="https://www.example.com")) + +# Use Chromium (default) +chromium_config = BrowserConfig(verbose=True, headless=True) +async with AsyncWebCrawler(config=chromium_config) as crawler: + result = await crawler.arun(config=CrawlerRunConfig(url="https://www.example.com")) +``` + +--- + +### User Simulation 🎭 + +Simulate real user behavior to bypass detection: + +```python +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig + +browser_config = BrowserConfig(verbose=True, headless=True) +crawl_config = CrawlerRunConfig( + url="YOUR-URL-HERE", + 
cache_mode=CacheMode.BYPASS, + simulate_user=True, # Random mouse movements and clicks + override_navigator=True # Makes the browser appear like a real user +) +async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(config=crawl_config) +``` + +--- + +### Understanding Parameters 🧠 + +Explore caching and forcing fresh crawls: + +```python +async def main(): + browser_config = BrowserConfig(verbose=True) + + async with AsyncWebCrawler(config=browser_config) as crawler: + # First crawl (uses cache) + result1 = await crawler.arun(config=CrawlerRunConfig(url="https://www.nbcnews.com/business")) + print(f"First crawl result: {result1.markdown[:100]}...") + + # Force fresh crawl + result2 = await crawler.arun( + config=CrawlerRunConfig(url="https://www.nbcnews.com/business", cache_mode=CacheMode.BYPASS) + ) + print(f"Second crawl result: {result2.markdown[:100]}...") + +if __name__ == "__main__": + asyncio.run(main()) +``` + +--- + +### Adding a Chunking Strategy 🧩 + +Split content into chunks using `RegexChunking`: + +```python +from crawl4ai.chunking_strategy import RegexChunking + +async def main(): + browser_config = BrowserConfig(verbose=True) + crawl_config = CrawlerRunConfig( + url="https://www.nbcnews.com/business", + chunking_strategy=RegexChunking(patterns=["\n\n"]) + ) + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(config=crawl_config) + print(f"RegexChunking result: {result.extracted_content[:200]}...") + +if __name__ == "__main__": + asyncio.run(main()) +``` + +--- + +### Advanced Features and Configurations + +For advanced examples (LLM strategies, knowledge graphs, pagination handling), ensure all code aligns with the `BrowserConfig` and `CrawlerRunConfig` pattern shown above. diff --git a/docs/md_v2/basic/simple-crawling.md b/docs/md_v2/basic/simple-crawling.md new file mode 100644 index 0000000000000000000000000000000000000000..ec63984c7fdff61de71e4a92e58a10736136b61e --- /dev/null +++ b/docs/md_v2/basic/simple-crawling.md @@ -0,0 +1,145 @@ +# Simple Crawling + +This guide covers the basics of web crawling with Crawl4AI. You'll learn how to set up a crawler, make your first request, and understand the response. + +## Basic Usage + +Set up a simple crawl using `BrowserConfig` and `CrawlerRunConfig`: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig + +async def main(): + browser_config = BrowserConfig() # Default browser configuration + run_config = CrawlerRunConfig() # Default crawl run configuration + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://example.com", + config=run_config + ) + print(result.markdown) # Print clean markdown content + +if __name__ == "__main__": + asyncio.run(main()) +``` + +## Understanding the Response + +The `arun()` method returns a `CrawlResult` object with several useful properties. 
Here's a quick overview (see [CrawlResult](../api/crawl-result.md) for complete details): + +```python +result = await crawler.arun( + url="https://example.com", + config=CrawlerRunConfig(fit_markdown=True) +) + +# Different content formats +print(result.html) # Raw HTML +print(result.cleaned_html) # Cleaned HTML +print(result.markdown) # Markdown version +print(result.fit_markdown) # Most relevant content in markdown + +# Check success status +print(result.success) # True if crawl succeeded +print(result.status_code) # HTTP status code (e.g., 200, 404) + +# Access extracted media and links +print(result.media) # Dictionary of found media (images, videos, audio) +print(result.links) # Dictionary of internal and external links +``` + +## Adding Basic Options + +Customize your crawl using `CrawlerRunConfig`: + +```python +run_config = CrawlerRunConfig( + word_count_threshold=10, # Minimum words per content block + exclude_external_links=True, # Remove external links + remove_overlay_elements=True, # Remove popups/modals + process_iframes=True # Process iframe content +) + +result = await crawler.arun( + url="https://example.com", + config=run_config +) +``` + +## Handling Errors + +Always check if the crawl was successful: + +```python +run_config = CrawlerRunConfig() +result = await crawler.arun(url="https://example.com", config=run_config) + +if not result.success: + print(f"Crawl failed: {result.error_message}") + print(f"Status code: {result.status_code}") +``` + +## Logging and Debugging + +Enable verbose logging in `BrowserConfig`: + +```python +browser_config = BrowserConfig(verbose=True) + +async with AsyncWebCrawler(config=browser_config) as crawler: + run_config = CrawlerRunConfig() + result = await crawler.arun(url="https://example.com", config=run_config) +``` + +## Complete Example + +Here's a more comprehensive example demonstrating common usage patterns: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, CacheMode + +async def main(): + browser_config = BrowserConfig(verbose=True) + run_config = CrawlerRunConfig( + # Content filtering + word_count_threshold=10, + excluded_tags=['form', 'header'], + exclude_external_links=True, + + # Content processing + process_iframes=True, + remove_overlay_elements=True, + + # Cache control + cache_mode=CacheMode.ENABLED # Use cache if available + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://example.com", + config=run_config + ) + + if result.success: + # Print clean content + print("Content:", result.markdown[:500]) # First 500 chars + + # Process images + for image in result.media["images"]: + print(f"Found image: {image['src']}") + + # Process links + for link in result.links["internal"]: + print(f"Internal link: {link['href']}") + + else: + print(f"Crawl failed: {result.error_message}") + +if __name__ == "__main__": + asyncio.run(main()) +``` diff --git a/docs/md_v2/blog/articles/dockerize_hooks.md b/docs/md_v2/blog/articles/dockerize_hooks.md new file mode 100644 index 0000000000000000000000000000000000000000..965388ee4def925d47aaf2541f2fcea77bcb91ed --- /dev/null +++ b/docs/md_v2/blog/articles/dockerize_hooks.md @@ -0,0 +1,46 @@ +## Introducing Event Streams and Interactive Hooks in Crawl4AI + +![event-driven-crawl](https://res.cloudinary.com/kidocode/image/upload/t_400x400/v1734344008/15bb8bbb-83ac-43ac-962d-3feb3e0c3bbf_2_tjmr4n.webp) + +In the near future, I’m planning to enhance 
Crawl4AI’s capabilities by introducing an event stream mechanism that will give clients deeper, real-time insights into the crawling process. Today, hooks are a powerful feature at the code level—they let developers define custom logic at key points in the crawl. However, when using Crawl4AI as a service (e.g., through a Dockerized API), there isn’t an easy way to interact with these hooks at runtime. + +**What’s Changing?** + +I’m working on a solution that will allow the crawler to emit a continuous stream of events, updating clients on the current crawling stage, encountered pages, and any decision points. This event stream could be exposed over a standardized protocol like Server-Sent Events (SSE) or WebSockets, enabling clients to “subscribe” and listen as the crawler works. + +**Interactivity Through Process IDs** + +A key part of this new design is the concept of a unique process ID for each crawl session. Imagine you’re listening to an event stream that informs you: +- The crawler just hit a certain page +- It triggered a hook and is now pausing for instructions + +With the event stream in place, you can send a follow-up request back to the server—referencing the unique process ID—to provide extra data, instructions, or parameters. This might include selecting which links to follow next, adjusting extraction strategies, or providing authentication tokens for a protected API. Once the crawler receives these instructions, it resumes execution with the updated context. + +```mermaid +sequenceDiagram + participant Client + participant Server + participant Crawler + + Client->>Server: Start crawl request + Server->>Crawler: Initiate crawl with Process ID + Crawler-->>Server: Event: Page hit + Server-->>Client: Stream: Page hit event + Client->>Server: Instruction for Process ID + Server->>Crawler: Update crawl with new instructions + Crawler-->>Server: Event: Crawl completed + Server-->>Client: Stream: Crawl completed +``` + +**Benefits for Developers and Users** + +1. **Fine-Grained Control**: Instead of predefining all logic upfront, you can dynamically guide the crawler in response to actual data and conditions encountered mid-crawl. +2. **Real-Time Insights**: Monitor progress, errors, or network bottlenecks as they happen, without waiting for the entire crawl to finish. +3. **Enhanced Collaboration**: Different team members or automated systems can watch the same crawl events and provide input, making the crawling process more adaptive and intelligent. + +**Next Steps** + +I’m currently exploring the best APIs, technologies, and patterns to make this vision a reality. My goal is to deliver a seamless developer experience—one that integrates with existing Crawl4AI workflows while offering new flexibility and power. + +Stay tuned for more updates as I continue building this feature out. In the meantime, I’d love to hear any feedback or suggestions you might have to help shape this interactive, event-driven future of web crawling with Crawl4AI. + diff --git a/docs/md_v2/blog/index.md b/docs/md_v2/blog/index.md new file mode 100644 index 0000000000000000000000000000000000000000..f7c8494da226ff85029bf84144d7c6ce4b7e38f6 --- /dev/null +++ b/docs/md_v2/blog/index.md @@ -0,0 +1,47 @@ +# Crawl4AI Blog + +Welcome to the Crawl4AI blog! Here you'll find detailed release notes, technical insights, and updates about the project. Whether you're looking for the latest improvements or want to dive deep into web crawling techniques, this is the place. 
+ +## Latest Release + +### [0.4.2 - Configurable Crawlers, Session Management, and Smarter Screenshots](releases/0.4.2.md) +*December 12, 2024* + +The 0.4.2 update brings massive improvements to configuration, making crawlers and browsers easier to manage with dedicated objects. You can now import/export local storage for seamless session management. Plus, long-page screenshots are faster and cleaner, and full-page PDF exports are now possible. Check out all the new features to make your crawling experience even smoother. + +[Read full release notes →](releases/0.4.2.md) + +--- + +### [0.4.1 - Smarter Crawling with Lazy-Load Handling, Text-Only Mode, and More](releases/0.4.1.md) +*December 8, 2024* + +This release brings major improvements to handling lazy-loaded images, a blazing-fast Text-Only Mode, full-page scanning for infinite scrolls, dynamic viewport adjustments, and session reuse for efficient crawling. If you're looking to improve speed, reliability, or handle dynamic content with ease, this update has you covered. + +[Read full release notes →](releases/0.4.1.md) + +--- + +### [0.4.0 - Major Content Filtering Update](releases/0.4.0.md) +*December 1, 2024* + +Introduced significant improvements to content filtering, multi-threaded environment handling, and user-agent generation. This release features the new PruningContentFilter, enhanced thread safety, and improved test coverage. + +[Read full release notes →](releases/0.4.0.md) + +## Project History + +Curious about how Crawl4AI has evolved? Check out our [complete changelog](https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md) for a detailed history of all versions and updates. + +## Categories + +- [Technical Deep Dives](/blog/technical) - Coming soon +- [Tutorials & Guides](/blog/tutorials) - Coming soon +- [Community Updates](/blog/community) - Coming soon + +## Stay Updated + +- Star us on [GitHub](https://github.com/unclecode/crawl4ai) +- Follow [@unclecode](https://twitter.com/unclecode) on Twitter +- Join our community discussions on GitHub + diff --git a/docs/md_v2/blog/releases/0.4.0.md b/docs/md_v2/blog/releases/0.4.0.md new file mode 100644 index 0000000000000000000000000000000000000000..0e7ee5df0e2bbf059cf6f57404a6c83d898df7bc --- /dev/null +++ b/docs/md_v2/blog/releases/0.4.0.md @@ -0,0 +1,62 @@ +# Release Summary for Version 0.4.0 (December 1, 2024) + +## Overview +The 0.4.0 release introduces significant improvements to content filtering, multi-threaded environment handling, user-agent generation, and test coverage. Key highlights include the introduction of the PruningContentFilter, designed to automatically identify and extract the most valuable parts of an HTML document, as well as enhancements to the BM25ContentFilter to extend its versatility and effectiveness. + +## Major Features and Enhancements + +### 1. PruningContentFilter +- Introduced a new unsupervised content filtering strategy that scores and prunes less relevant nodes in an HTML document based on metrics like text and link density. +- Focuses on retaining the most valuable parts of the content, making it highly effective for extracting relevant information from complex web pages. +- Fully documented with updated README and expanded user guides. + +### 2. User-Agent Generator +- Added a user-agent generator utility that resolves compatibility issues and supports customizable user-agent strings. +- By default, the generator randomizes user agents for each request, adding diversity, but users can customize it for tailored scenarios. 
+ +### 3. Enhanced Thread Safety +- Improved handling of multi-threaded environments by adding better thread locks for parallel processing, ensuring consistency and stability when running multiple threads. + +### 4. Extended Content Filtering Strategies +- Users now have access to both the PruningContentFilter for unsupervised extraction and the BM25ContentFilter for supervised filtering based on user queries. +- Enhanced BM25ContentFilter with improved capabilities to process page titles, meta tags, and descriptions, allowing for more effective classification and clustering of text chunks. + +### 5. Documentation Updates +- Updated examples and tutorials to promote the use of the PruningContentFilter alongside the BM25ContentFilter, providing clear instructions for selecting the appropriate filter for each use case. + +### 6. Unit Test Enhancements +- Added unit tests for PruningContentFilter to ensure accuracy and reliability. +- Enhanced BM25ContentFilter tests to cover additional edge cases and performance metrics, particularly for malformed HTML inputs. + +## Revised Change Logs for Version 0.4.0 + +### PruningContentFilter (Dec 01, 2024) +- Introduced the PruningContentFilter to optimize content extraction by pruning less relevant HTML nodes. + - **Affected Files:** + - **crawl4ai/content_filter_strategy.py**: Added a scoring-based pruning algorithm. + - **README.md**: Updated to include PruningContentFilter usage. + - **docs/md_v2/basic/content_filtering.md**: Expanded user documentation, detailing the use and benefits of PruningContentFilter. + +### Unit Tests for PruningContentFilter (Dec 01, 2024) +- Added comprehensive unit tests for PruningContentFilter to ensure correctness and efficiency. + - **Affected Files:** + - **tests/async/test_content_filter_prune.py**: Created tests covering different pruning scenarios to ensure stability and correctness. + +### Enhanced BM25ContentFilter Tests (Dec 01, 2024) +- Expanded tests to cover additional extraction scenarios and performance metrics, improving robustness. + - **Affected Files:** + - **tests/async/test_content_filter_bm25.py**: Added tests for edge cases, including malformed HTML inputs. + +### Documentation and Example Updates (Dec 01, 2024) +- Revised examples to illustrate the use of PruningContentFilter alongside existing content filtering methods. + - **Affected Files:** + - **docs/examples/quickstart_async.py**: Enhanced example clarity and usability for new users. + +## Experimental Features +- The PruningContentFilter is still under experimental development, and we continue to gather feedback for further refinements. + +## Conclusion +This release significantly enhances the content extraction capabilities of Crawl4ai with the introduction of the PruningContentFilter, improved supervised filtering with BM25ContentFilter, and robust multi-threaded handling. Additionally, the user-agent generator provides much-needed versatility, resolving compatibility issues faced by many users. + +Users are encouraged to experiment with the new content filtering methods to determine which best suits their needs. + diff --git a/docs/md_v2/blog/releases/0.4.1.md b/docs/md_v2/blog/releases/0.4.1.md new file mode 100644 index 0000000000000000000000000000000000000000..e770d0b21ade2fd2cfd625b1910deada85425c33 --- /dev/null +++ b/docs/md_v2/blog/releases/0.4.1.md @@ -0,0 +1,145 @@ +# Release Summary for Version 0.4.1 (December 8, 2024): Major Efficiency Boosts with New Features! 
+ +_This post was generated with the help of ChatGPT, take everything with a grain of salt. 🧂_ + +Hi everyone, + +I just finished putting together version 0.4.1 of Crawl4AI, and there are a few changes in here that I think you’ll find really helpful. I’ll explain what’s new, why it matters, and exactly how you can use these features (with the code to back it up). Let’s get into it. + +--- + +### Handling Lazy Loading Better (Images Included) + +One thing that always bugged me with crawlers is how often they miss lazy-loaded content, especially images. In this version, I made sure Crawl4AI **waits for all images to load** before moving forward. This is useful because many modern websites only load images when they’re in the viewport or after some JavaScript executes. + +Here’s how to enable it: + +```python +await crawler.crawl( + url="https://example.com", + wait_for_images=True # Add this argument to ensure images are fully loaded +) +``` + +What this does is: +1. Waits for the page to reach a "network idle" state. +2. Ensures all images on the page have been completely loaded. + +This single change handles the majority of lazy-loading cases you’re likely to encounter. + +--- + +### Text-Only Mode (Fast, Lightweight Crawling) + +Sometimes, you don’t need to download images or process JavaScript at all. For example, if you’re crawling to extract text data, you can enable **text-only mode** to speed things up. By disabling images, JavaScript, and other heavy resources, this mode makes crawling **3-4 times faster** in most cases. + +Here’s how to turn it on: + +```python +crawler = AsyncPlaywrightCrawlerStrategy( + text_mode=True # Set this to True to enable text-only crawling +) +``` + +When `text_mode=True`, the crawler automatically: +- Disables GPU processing. +- Blocks image and JavaScript resources. +- Reduces the viewport size to 800x600 (you can override this with `viewport_width` and `viewport_height`). + +If you need to crawl thousands of pages where you only care about text, this mode will save you a ton of time and resources. + +--- + +### Adjusting the Viewport Dynamically + +Another useful addition is the ability to **dynamically adjust the viewport size** to match the content on the page. This is particularly helpful when you’re working with responsive layouts or want to ensure all parts of the page load properly. + +Here’s how it works: +1. The crawler calculates the page’s width and height after it loads. +2. It adjusts the viewport to fit the content dimensions. +3. (Optional) It uses Chrome DevTools Protocol (CDP) to simulate zooming out so everything fits in the viewport. + +To enable this, use: + +```python +await crawler.crawl( + url="https://example.com", + adjust_viewport_to_content=True # Dynamically adjusts the viewport +) +``` + +This approach makes sure the entire page gets loaded into the viewport, especially for layouts that load content based on visibility. + +--- + +### Simulating Full-Page Scrolling + +Some websites load data dynamically as you scroll down the page. To handle these cases, I added support for **full-page scanning**. It simulates scrolling to the bottom of the page, checking for new content, and capturing it all. + +Here’s an example: + +```python +await crawler.crawl( + url="https://example.com", + scan_full_page=True, # Enables scrolling + scroll_delay=0.2 # Waits 200ms between scrolls (optional) +) +``` + +What happens here: +1. The crawler scrolls down in increments, waiting for content to load after each scroll. +2. 
It stops when no new content appears (i.e., dynamic elements stop loading). +3. It scrolls back to the top before finishing (if necessary). + +If you’ve ever had to deal with infinite scroll pages, this is going to save you a lot of headaches. + +--- + +### Reusing Browser Sessions (Save Time on Setup) + +By default, every time you crawl a page, a new browser context (or tab) is created. That’s fine for small crawls, but if you’re working on a large dataset, it’s more efficient to reuse the same session. + +I added a method called `create_session` for this: + +```python +session_id = await crawler.create_session() + +# Use the same session for multiple crawls +await crawler.crawl( + url="https://example.com/page1", + session_id=session_id # Reuse the session +) +await crawler.crawl( + url="https://example.com/page2", + session_id=session_id +) +``` + +This avoids creating a new tab for every page, speeding up the crawl and reducing memory usage. + +--- + +### Other Updates + +Here are a few smaller updates I’ve made: +- **Light Mode**: Use `light_mode=True` to disable background processes, extensions, and other unnecessary features, making the browser more efficient. +- **Logging**: Improved logs to make debugging easier. +- **Defaults**: Added sensible defaults for things like `delay_before_return_html` (now set to 0.1 seconds). + +--- + +### How to Get the Update + +You can install or upgrade to version `0.4.1` like this: + +```bash +pip install crawl4ai --upgrade +``` + +As always, I’d love to hear your thoughts. If there’s something you think could be improved or if you have suggestions for future versions, let me know! + +Enjoy the new features, and happy crawling! 🕷️ + +--- + + diff --git a/docs/md_v2/blog/releases/0.4.2.md b/docs/md_v2/blog/releases/0.4.2.md new file mode 100644 index 0000000000000000000000000000000000000000..6f8f39e9f2bf37e9e3086de7a7a7c46acf42181d --- /dev/null +++ b/docs/md_v2/blog/releases/0.4.2.md @@ -0,0 +1,86 @@ +## 🚀 Crawl4AI 0.4.2 Update: Smarter Crawling Just Got Easier (Dec 12, 2024) + +### Hey Developers, + +I’m excited to share Crawl4AI 0.4.2—a major upgrade that makes crawling smarter, faster, and a whole lot more intuitive. I’ve packed in a bunch of new features to simplify your workflows and improve your experience. Let’s cut to the chase! + +--- + +### 🔧 **Configurable Browser and Crawler Behavior** + +You’ve asked for better control over how browsers and crawlers are configured, and now you’ve got it. With the new `BrowserConfig` and `CrawlerRunConfig` objects, you can set up your browser and crawling behavior exactly how you want. No more cluttering `arun` with a dozen arguments—just pass in your configs and go. + +**Example:** +```python +from crawl4ai import BrowserConfig, CrawlerRunConfig, AsyncWebCrawler + +browser_config = BrowserConfig(headless=True, viewport_width=1920, viewport_height=1080) +crawler_config = CrawlerRunConfig(cache_mode="BYPASS") + +async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url="https://example.com", config=crawler_config) + print(result.markdown[:500]) +``` + +This setup is a game-changer for scalability, keeping your code clean and flexible as we add more parameters in the future. + +Remember: If you like to use the old way, you can still pass arguments directly to `arun` as before, no worries! + +--- + +### 🔐 **Streamlined Session Management** + +Here’s the big one: You can now pass local storage and cookies directly. 
Whether it’s setting values programmatically or importing a saved JSON state, managing sessions has never been easier. This is a must-have for authenticated crawls—just export your storage state once and reuse it effortlessly across runs. + +**Example:** +1. Open a browser, log in manually, and export the storage state. +2. Import the JSON file for seamless authenticated crawling: + +```python +result = await crawler.arun( + url="https://example.com/protected", + storage_state="my_storage_state.json" +) +``` + +--- + +### 🔢 **Handling Large Pages: Supercharged Screenshots and PDF Conversion** + +Two big upgrades here: + +- **Blazing-fast long-page screenshots**: Turn extremely long web pages into clean, high-quality screenshots—without breaking a sweat. It’s optimized to handle large content without lag. + +- **Full-page PDF exports**: Now, you can also convert any page into a PDF with all the details intact. Perfect for archiving or sharing complex layouts. + +--- + +### 🔧 **Other Cool Stuff** + +- **Anti-bot enhancements**: Magic mode now handles overlays, user simulation, and anti-detection features like a pro. +- **JavaScript execution**: Execute custom JS snippets to handle dynamic content. No more wrestling with endless page interactions. + +--- + +### 📊 **Performance Boosts and Dev-friendly Updates** + +- Faster rendering and viewport adjustments for better performance. +- Improved cookie and local storage handling for seamless authentication. +- Better debugging with detailed logs and actionable error messages. + +--- + +### 🔠 **Use Cases You’ll Love** + +1. **Authenticated Crawls**: Login once, export your storage state, and reuse it across multiple requests without the headache. +2. **Long-page Screenshots**: Perfect for blogs, e-commerce pages, or any endless-scroll website. +3. **PDF Export**: Create professional-looking page PDFs in seconds. + +--- + +### Let’s Get Crawling + +Crawl4AI 0.4.2 is ready for you to download and try. I’m always looking for ways to improve, so don’t hold back—share your thoughts and feedback. + +Happy Crawling! 🚀 + diff --git a/docs/md_v2/extraction/chunking.md b/docs/md_v2/extraction/chunking.md new file mode 100644 index 0000000000000000000000000000000000000000..f429310f822b63a413b9b36c2db6081f631e04b3 --- /dev/null +++ b/docs/md_v2/extraction/chunking.md @@ -0,0 +1,133 @@ +## Chunking Strategies 📚 + +Crawl4AI provides several powerful chunking strategies to divide text into manageable parts for further processing. Each strategy has unique characteristics and is suitable for different scenarios. Let's explore them one by one. + +### RegexChunking + +`RegexChunking` splits text using regular expressions. This is ideal for creating chunks based on specific patterns like paragraphs or sentences. + +#### When to Use +- Great for structured text with consistent delimiters. +- Suitable for documents where specific patterns (e.g., double newlines, periods) indicate logical chunks. + +#### Parameters +- `patterns` (list, optional): Regular expressions used to split the text. Default is to split by double newlines (`['\n\n']`). + +#### Example +```python +from crawl4ai.chunking_strategy import RegexChunking + +# Define patterns for splitting text +patterns = [r'\n\n', r'\. '] +chunker = RegexChunking(patterns=patterns) + +# Sample text +text = "This is a sample text. It will be split into chunks.\n\nThis is another paragraph." 
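+# With the patterns above, the text is split on blank lines and on ". " sentence boundaries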
+ +# Chunk the text +chunks = chunker.chunk(text) +print(chunks) +``` + +### NlpSentenceChunking + +`NlpSentenceChunking` uses NLP models to split text into sentences, ensuring accurate sentence boundaries. + +#### When to Use +- Ideal for texts where sentence boundaries are crucial. +- Useful for creating chunks that preserve grammatical structures. + +#### Parameters +- None. + +#### Example +```python +from crawl4ai.chunking_strategy import NlpSentenceChunking + +chunker = NlpSentenceChunking() + +# Sample text +text = "This is a sample text. It will be split into sentences. Here's another sentence." + +# Chunk the text +chunks = chunker.chunk(text) +print(chunks) +``` + +### TopicSegmentationChunking + +`TopicSegmentationChunking` employs the TextTiling algorithm to segment text into topic-based chunks. This method identifies thematic boundaries. + +#### When to Use +- Perfect for long documents with distinct topics. +- Useful when preserving topic continuity is more important than maintaining text order. + +#### Parameters +- `num_keywords` (int, optional): Number of keywords for each topic segment. Default is `3`. + +#### Example +```python +from crawl4ai.chunking_strategy import TopicSegmentationChunking + +chunker = TopicSegmentationChunking(num_keywords=3) + +# Sample text +text = "This document contains several topics. Topic one discusses AI. Topic two covers machine learning." + +# Chunk the text +chunks = chunker.chunk(text) +print(chunks) +``` + +### FixedLengthWordChunking + +`FixedLengthWordChunking` splits text into chunks based on a fixed number of words. This ensures each chunk has approximately the same length. + +#### When to Use +- Suitable for processing large texts where uniform chunk size is important. +- Useful when the number of words per chunk needs to be controlled. + +#### Parameters +- `chunk_size` (int, optional): Number of words per chunk. Default is `100`. + +#### Example +```python +from crawl4ai.chunking_strategy import FixedLengthWordChunking + +chunker = FixedLengthWordChunking(chunk_size=10) + +# Sample text +text = "This is a sample text. It will be split into chunks of fixed length." + +# Chunk the text +chunks = chunker.chunk(text) +print(chunks) +``` + +### SlidingWindowChunking + +`SlidingWindowChunking` uses a sliding window approach to create overlapping chunks. Each chunk has a fixed length, and the window slides by a specified step size. + +#### When to Use +- Ideal for creating overlapping chunks to preserve context. +- Useful for tasks where context from adjacent chunks is needed. + +#### Parameters +- `window_size` (int, optional): Number of words in each chunk. Default is `100`. +- `step` (int, optional): Number of words to slide the window. Default is `50`. + +#### Example +```python +from crawl4ai.chunking_strategy import SlidingWindowChunking + +chunker = SlidingWindowChunking(window_size=10, step=5) + +# Sample text +text = "This is a sample text. It will be split using a sliding window approach to preserve context." + +# Chunk the text +chunks = chunker.chunk(text) +print(chunks) +``` + +With these chunking strategies, you can choose the best method to divide your text based on your specific needs. Whether you need precise sentence boundaries, topic-based segmentation, or uniform chunk sizes, Crawl4AI has you covered. Happy chunking! 
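+# Each chunk spans 10 words and the window advances 5 words, so consecutive chunks overlap by half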
📝✨ diff --git a/docs/md_v2/extraction/cosine.md b/docs/md_v2/extraction/cosine.md new file mode 100644 index 0000000000000000000000000000000000000000..9ce49e405e8e6bfef820e8a3b7e1dc26ee575150 --- /dev/null +++ b/docs/md_v2/extraction/cosine.md @@ -0,0 +1,222 @@ +# Cosine Strategy + +The Cosine Strategy in Crawl4AI uses similarity-based clustering to identify and extract relevant content sections from web pages. This strategy is particularly useful when you need to find and extract content based on semantic similarity rather than structural patterns. + +## How It Works + +The Cosine Strategy: +1. Breaks down page content into meaningful chunks +2. Converts text into vector representations +3. Calculates similarity between chunks +4. Clusters similar content together +5. Ranks and filters content based on relevance + +## Basic Usage + +```python +from crawl4ai.extraction_strategy import CosineStrategy + +strategy = CosineStrategy( + semantic_filter="product reviews", # Target content type + word_count_threshold=10, # Minimum words per cluster + sim_threshold=0.3 # Similarity threshold +) + +async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/reviews", + extraction_strategy=strategy + ) + + content = result.extracted_content +``` + +## Configuration Options + +### Core Parameters + +```python +CosineStrategy( + # Content Filtering + semantic_filter: str = None, # Keywords/topic for content filtering + word_count_threshold: int = 10, # Minimum words per cluster + sim_threshold: float = 0.3, # Similarity threshold (0.0 to 1.0) + + # Clustering Parameters + max_dist: float = 0.2, # Maximum distance for clustering + linkage_method: str = 'ward', # Clustering linkage method + top_k: int = 3, # Number of top categories to extract + + # Model Configuration + model_name: str = 'sentence-transformers/all-MiniLM-L6-v2', # Embedding model + + verbose: bool = False # Enable logging +) +``` + +### Parameter Details + +1. **semantic_filter** + - Sets the target topic or content type + - Use keywords relevant to your desired content + - Example: "technical specifications", "user reviews", "pricing information" + +2. **sim_threshold** + - Controls how similar content must be to be grouped together + - Higher values (e.g., 0.8) mean stricter matching + - Lower values (e.g., 0.3) allow more variation + ```python + # Strict matching + strategy = CosineStrategy(sim_threshold=0.8) + + # Loose matching + strategy = CosineStrategy(sim_threshold=0.3) + ``` + +3. **word_count_threshold** + - Filters out short content blocks + - Helps eliminate noise and irrelevant content + ```python + # Only consider substantial paragraphs + strategy = CosineStrategy(word_count_threshold=50) + ``` + +4. **top_k** + - Number of top content clusters to return + - Higher values return more diverse content + ```python + # Get top 5 most relevant content clusters + strategy = CosineStrategy(top_k=5) + ``` + +## Use Cases + +### 1. Article Content Extraction +```python +strategy = CosineStrategy( + semantic_filter="main article content", + word_count_threshold=100, # Longer blocks for articles + top_k=1 # Usually want single main content +) + +result = await crawler.arun( + url="https://example.com/blog/post", + extraction_strategy=strategy +) +``` + +### 2. 
Product Review Analysis +```python +strategy = CosineStrategy( + semantic_filter="customer reviews and ratings", + word_count_threshold=20, # Reviews can be shorter + top_k=10, # Get multiple reviews + sim_threshold=0.4 # Allow variety in review content +) +``` + +### 3. Technical Documentation +```python +strategy = CosineStrategy( + semantic_filter="technical specifications documentation", + word_count_threshold=30, + sim_threshold=0.6, # Stricter matching for technical content + max_dist=0.3 # Allow related technical sections +) +``` + +## Advanced Features + +### Custom Clustering +```python +strategy = CosineStrategy( + linkage_method='complete', # Alternative clustering method + max_dist=0.4, # Larger clusters + model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2' # Multilingual support +) +``` + +### Content Filtering Pipeline +```python +strategy = CosineStrategy( + semantic_filter="pricing plans features", + word_count_threshold=15, + sim_threshold=0.5, + top_k=3 +) + +async def extract_pricing_features(url: str): + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url=url, + extraction_strategy=strategy + ) + + if result.success: + content = json.loads(result.extracted_content) + return { + 'pricing_features': content, + 'clusters': len(content), + 'similarity_scores': [item['score'] for item in content] + } +``` + +## Best Practices + +1. **Adjust Thresholds Iteratively** + - Start with default values + - Adjust based on results + - Monitor clustering quality + +2. **Choose Appropriate Word Count Thresholds** + - Higher for articles (100+) + - Lower for reviews/comments (20+) + - Medium for product descriptions (50+) + +3. **Optimize Performance** + ```python + strategy = CosineStrategy( + word_count_threshold=10, # Filter early + top_k=5, # Limit results + verbose=True # Monitor performance + ) + ``` + +4. **Handle Different Content Types** + ```python + # For mixed content pages + strategy = CosineStrategy( + semantic_filter="product features", + sim_threshold=0.4, # More flexible matching + max_dist=0.3, # Larger clusters + top_k=3 # Multiple relevant sections + ) + ``` + +## Error Handling + +```python +try: + result = await crawler.arun( + url="https://example.com", + extraction_strategy=strategy + ) + + if result.success: + content = json.loads(result.extracted_content) + if not content: + print("No relevant content found") + else: + print(f"Extraction failed: {result.error_message}") + +except Exception as e: + print(f"Error during extraction: {str(e)}") +``` + +The Cosine Strategy is particularly effective when: +- Content structure is inconsistent +- You need semantic understanding +- You want to find similar content blocks +- Structure-based extraction (CSS/XPath) isn't reliable + +It works well with other strategies and can be used as a pre-processing step for LLM-based extraction. \ No newline at end of file diff --git a/docs/md_v2/extraction/css-advanced.md b/docs/md_v2/extraction/css-advanced.md new file mode 100644 index 0000000000000000000000000000000000000000..393b79a5605097c7a7064a00eb9a713170d1ba30 --- /dev/null +++ b/docs/md_v2/extraction/css-advanced.md @@ -0,0 +1,282 @@ +# Advanced Usage of JsonCssExtractionStrategy + +While the basic usage of JsonCssExtractionStrategy is powerful for simple structures, its true potential shines when dealing with complex, nested HTML structures. This section will explore advanced usage scenarios, demonstrating how to extract nested objects, lists, and nested lists. 
+ +## Hypothetical Website Example + +Let's consider a hypothetical e-commerce website that displays product categories, each containing multiple products. Each product has details, reviews, and related items. This complex structure will allow us to demonstrate various advanced features of JsonCssExtractionStrategy. + +Assume the HTML structure looks something like this: + +```html +
+<div class="category">
+  <h2 class="category-name">Electronics</h2>
+  <div class="product">
+    <h3 class="product-name">Smartphone X</h3>
+    <p class="product-price">$999</p>
+    <div class="product-details">
+      <span class="brand">TechCorp</span>
+      <span class="model">X-2000</span>
+    </div>
+    <ul class="product-features">
+      <li>5G capable</li>
+      <li>6.5" OLED screen</li>
+      <li>128GB storage</li>
+    </ul>
+    <div class="review">
+      <span class="reviewer">John D.</span>
+      <span class="rating">4.5</span>
+      <p class="review-text">Great phone, love the camera!</p>
+    </div>
+    <div class="review">
+      <span class="reviewer">Jane S.</span>
+      <span class="rating">5</span>
+      <p class="review-text">Best smartphone I've ever owned.</p>
+    </div>
+    <ul class="related-products">
+      <li>
+        <span class="related-name">...</span>
+        <span class="related-price">...</span>
+      </li>
+    </ul>
+  </div>
+</div>
    +``` + +Now, let's create a schema to extract this complex structure: + +```python +schema = { + "name": "E-commerce Product Catalog", + "baseSelector": "div.category", + "fields": [ + { + "name": "category_name", + "selector": "h2.category-name", + "type": "text" + }, + { + "name": "products", + "selector": "div.product", + "type": "nested_list", + "fields": [ + { + "name": "name", + "selector": "h3.product-name", + "type": "text" + }, + { + "name": "price", + "selector": "p.product-price", + "type": "text" + }, + { + "name": "details", + "selector": "div.product-details", + "type": "nested", + "fields": [ + { + "name": "brand", + "selector": "span.brand", + "type": "text" + }, + { + "name": "model", + "selector": "span.model", + "type": "text" + } + ] + }, + { + "name": "features", + "selector": "ul.product-features li", + "type": "list", + "fields": [ + { + "name": "feature", + "type": "text" + } + ] + }, + { + "name": "reviews", + "selector": "div.review", + "type": "nested_list", + "fields": [ + { + "name": "reviewer", + "selector": "span.reviewer", + "type": "text" + }, + { + "name": "rating", + "selector": "span.rating", + "type": "text" + }, + { + "name": "comment", + "selector": "p.review-text", + "type": "text" + } + ] + }, + { + "name": "related_products", + "selector": "ul.related-products li", + "type": "list", + "fields": [ + { + "name": "name", + "selector": "span.related-name", + "type": "text" + }, + { + "name": "price", + "selector": "span.related-price", + "type": "text" + } + ] + } + ] + } + ] +} +``` + +This schema demonstrates several advanced features: + +1. **Nested Objects**: The `details` field is a nested object within each product. +2. **Simple Lists**: The `features` field is a simple list of text items. +3. **Nested Lists**: The `products` field is a nested list, where each item is a complex object. +4. **Lists of Objects**: The `reviews` and `related_products` fields are lists of objects. + +Let's break down the key concepts: + +### Nested Objects + +To create a nested object, use `"type": "nested"` and provide a `fields` array for the nested structure: + +```python +{ + "name": "details", + "selector": "div.product-details", + "type": "nested", + "fields": [ + { + "name": "brand", + "selector": "span.brand", + "type": "text" + }, + { + "name": "model", + "selector": "span.model", + "type": "text" + } + ] +} +``` + +### Simple Lists + +For a simple list of identical items, use `"type": "list"`: + +```python +{ + "name": "features", + "selector": "ul.product-features li", + "type": "list", + "fields": [ + { + "name": "feature", + "type": "text" + } + ] +} +``` + +### Nested Lists + +For a list of complex objects, use `"type": "nested_list"`: + +```python +{ + "name": "products", + "selector": "div.product", + "type": "nested_list", + "fields": [ + // ... 
fields for each product + ] +} +``` + +### Lists of Objects + +Similar to nested lists, but typically used for simpler objects within the list: + +```python +{ + "name": "related_products", + "selector": "ul.related-products li", + "type": "list", + "fields": [ + { + "name": "name", + "selector": "span.related-name", + "type": "text" + }, + { + "name": "price", + "selector": "span.related-price", + "type": "text" + } + ] +} +``` + +## Using the Advanced Schema + +To use this advanced schema with AsyncWebCrawler: + +```python +import json +import asyncio +from crawl4ai import AsyncWebCrawler +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy + +async def extract_complex_product_data(): + extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True) + + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun( + url="https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html", + extraction_strategy=extraction_strategy, + bypass_cache=True, + ) + + assert result.success, "Failed to crawl the page" + + product_data = json.loads(result.extracted_content) + print(json.dumps(product_data, indent=2)) + +asyncio.run(extract_complex_product_data()) +``` + +This will produce a structured JSON output that captures the complex hierarchy of the product catalog, including nested objects, lists, and nested lists. + +## Tips for Advanced Usage + +1. **Start Simple**: Begin with a basic schema and gradually add complexity. +2. **Test Incrementally**: Test each part of your schema separately before combining them. +3. **Use Chrome DevTools**: The Element Inspector is invaluable for identifying the correct selectors. +4. **Handle Missing Data**: Use the `default` key in your field definitions to handle cases where data might be missing. +5. **Leverage Transforms**: Use the `transform` key to clean or format extracted data (e.g., converting prices to numbers). +6. **Consider Performance**: Very complex schemas might slow down extraction. Balance complexity with performance needs. + +By mastering these advanced techniques, you can use JsonCssExtractionStrategy to extract highly structured data from even the most complex web pages, making it a powerful tool for web scraping and data analysis tasks. \ No newline at end of file diff --git a/docs/md_v2/extraction/css.md b/docs/md_v2/extraction/css.md new file mode 100644 index 0000000000000000000000000000000000000000..3b5075a6f77333542209d87e6884814487209a1a --- /dev/null +++ b/docs/md_v2/extraction/css.md @@ -0,0 +1,142 @@ +# JSON CSS Extraction Strategy with AsyncWebCrawler + +The `JsonCssExtractionStrategy` is a powerful feature of Crawl4AI that allows you to extract structured data from web pages using CSS selectors. This method is particularly useful when you need to extract specific data points from a consistent HTML structure, such as tables or repeated elements. Here's how to use it with the AsyncWebCrawler. + +## Overview + +The `JsonCssExtractionStrategy` works by defining a schema that specifies: +1. A base CSS selector for the repeating elements +2. Fields to extract from each element, each with its own CSS selector + +This strategy is fast and efficient, as it doesn't rely on external services like LLMs for extraction. + +## Example: Extracting Cryptocurrency Prices from Coinbase + +Let's look at an example that extracts cryptocurrency prices from the Coinbase explore page. 
+ +```python +import json +import asyncio +from crawl4ai import AsyncWebCrawler +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy + +async def extract_structured_data_using_css_extractor(): + print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---") + + # Define the extraction schema + schema = { + "name": "Coinbase Crypto Prices", + "baseSelector": ".cds-tableRow-t45thuk", + "fields": [ + { + "name": "crypto", + "selector": "td:nth-child(1) h2", + "type": "text", + }, + { + "name": "symbol", + "selector": "td:nth-child(1) p", + "type": "text", + }, + { + "name": "price", + "selector": "td:nth-child(2)", + "type": "text", + } + ], + } + + # Create the extraction strategy + extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True) + + # Use the AsyncWebCrawler with the extraction strategy + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun( + url="https://www.coinbase.com/explore", + extraction_strategy=extraction_strategy, + bypass_cache=True, + ) + + assert result.success, "Failed to crawl the page" + + # Parse the extracted content + crypto_prices = json.loads(result.extracted_content) + print(f"Successfully extracted {len(crypto_prices)} cryptocurrency prices") + print(json.dumps(crypto_prices[0], indent=2)) + + return crypto_prices + +# Run the async function +asyncio.run(extract_structured_data_using_css_extractor()) +``` + +## Explanation of the Schema + +The schema defines how to extract the data: + +- `name`: A descriptive name for the extraction task. +- `baseSelector`: The CSS selector for the repeating elements (in this case, table rows). +- `fields`: An array of fields to extract from each element: + - `name`: The name to give the extracted data. + - `selector`: The CSS selector to find the specific data within the base element. + - `type`: The type of data to extract (usually "text" for textual content). + +## Advantages of JsonCssExtractionStrategy + +1. **Speed**: CSS selectors are fast to execute, making this method efficient for large datasets. +2. **Precision**: You can target exactly the elements you need. +3. **Structured Output**: The result is already structured as JSON, ready for further processing. +4. **No External Dependencies**: Unlike LLM-based strategies, this doesn't require any API calls to external services. + +## Tips for Using JsonCssExtractionStrategy + +1. **Inspect the Page**: Use browser developer tools to identify the correct CSS selectors. +2. **Test Selectors**: Verify your selectors in the browser console before using them in the script. +3. **Handle Dynamic Content**: If the page uses JavaScript to load content, you may need to combine this with JS execution (see the Advanced Usage section). +4. **Error Handling**: Always check the `result.success` flag and handle potential failures. 
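+
+As a sketch of tip 4, the success check can be made explicit before any parsing happens. This reuses the `extraction_strategy` defined in the example above and relies only on the `success`, `error_message`, and `extracted_content` fields already shown in this guide:
+
+```python
+result = await crawler.arun(
+    url="https://www.coinbase.com/explore",
+    extraction_strategy=extraction_strategy,
+    bypass_cache=True,
+)
+
+if not result.success:
+    # Surface the failure instead of attempting to parse empty content
+    print(f"Crawl failed: {result.error_message}")
+else:
+    crypto_prices = json.loads(result.extracted_content)
+    print(f"Extracted {len(crypto_prices)} rows")
+```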
+ +## Advanced Usage: Combining with JavaScript Execution + +For pages that load data dynamically, you can combine the `JsonCssExtractionStrategy` with JavaScript execution: + +```python +async def extract_dynamic_structured_data(): + schema = { + "name": "Dynamic Crypto Prices", + "baseSelector": ".crypto-row", + "fields": [ + {"name": "name", "selector": ".crypto-name", "type": "text"}, + {"name": "price", "selector": ".crypto-price", "type": "text"}, + ] + } + + js_code = """ + window.scrollTo(0, document.body.scrollHeight); + await new Promise(resolve => setTimeout(resolve, 2000)); // Wait for 2 seconds + """ + + extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True) + + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun( + url="https://example.com/crypto-prices", + extraction_strategy=extraction_strategy, + js_code=js_code, + wait_for=".crypto-row:nth-child(20)", # Wait for 20 rows to load + bypass_cache=True, + ) + + crypto_data = json.loads(result.extracted_content) + print(f"Extracted {len(crypto_data)} cryptocurrency entries") + +asyncio.run(extract_dynamic_structured_data()) +``` + +This advanced example demonstrates how to: +1. Execute JavaScript to trigger dynamic content loading. +2. Wait for a specific condition (20 rows loaded) before extraction. +3. Extract data from the dynamically loaded content. + +By mastering the `JsonCssExtractionStrategy`, you can efficiently extract structured data from a wide variety of web pages, making it a valuable tool in your web scraping toolkit. + +For more details on schema definitions and advanced extraction strategies, check out the[Advanced JsonCssExtraction](./css-advanced.md). \ No newline at end of file diff --git a/docs/md_v2/extraction/llm.md b/docs/md_v2/extraction/llm.md new file mode 100644 index 0000000000000000000000000000000000000000..ca5621476c5cc75af6db4cdda83512c57a801d58 --- /dev/null +++ b/docs/md_v2/extraction/llm.md @@ -0,0 +1,179 @@ +# LLM Extraction with AsyncWebCrawler + +Crawl4AI's AsyncWebCrawler allows you to use Language Models (LLMs) to extract structured data or relevant content from web pages asynchronously. Below are two examples demonstrating how to use `LLMExtractionStrategy` for different purposes with the AsyncWebCrawler. + +## Example 1: Extract Structured Data + +In this example, we use the `LLMExtractionStrategy` to extract structured data (model names and their fees) from the OpenAI pricing page. + +```python +import os +import json +import asyncio +from crawl4ai import AsyncWebCrawler +from crawl4ai.extraction_strategy import LLMExtractionStrategy +from pydantic import BaseModel, Field + +class OpenAIModelFee(BaseModel): + model_name: str = Field(..., description="Name of the OpenAI model.") + input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") + output_fee: str = Field(..., description="Fee for output token for the OpenAI model.") + +async def extract_openai_fees(): + url = 'https://openai.com/api/pricing/' + + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun( + url=url, + word_count_threshold=1, + extraction_strategy=LLMExtractionStrategy( + provider="openai/gpt-4o", # Or use ollama like provider="ollama/nemotron" + api_token=os.getenv('OPENAI_API_KEY'), + schema=OpenAIModelFee.model_json_schema(), + extraction_type="schema", + instruction="From the crawled content, extract all mentioned model names along with their " + "fees for input and output tokens. 
Make sure not to miss anything in the entire content. " + 'One extracted model JSON format should look like this: ' + '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }' + ), + bypass_cache=True, + ) + + model_fees = json.loads(result.extracted_content) + print(f"Number of models extracted: {len(model_fees)}") + + with open(".data/openai_fees.json", "w", encoding="utf-8") as f: + json.dump(model_fees, f, indent=2) + +asyncio.run(extract_openai_fees()) +``` + +## Example 2: Extract Relevant Content + +In this example, we instruct the LLM to extract only content related to technology from the NBC News business page. + +```python +import os +import json +import asyncio +from crawl4ai import AsyncWebCrawler +from crawl4ai.extraction_strategy import LLMExtractionStrategy + +async def extract_tech_content(): + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", + extraction_strategy=LLMExtractionStrategy( + provider="openai/gpt-4o", + api_token=os.getenv('OPENAI_API_KEY'), + instruction="Extract only content related to technology" + ), + bypass_cache=True, + ) + + tech_content = json.loads(result.extracted_content) + print(f"Number of tech-related items extracted: {len(tech_content)}") + + with open(".data/tech_content.json", "w", encoding="utf-8") as f: + json.dump(tech_content, f, indent=2) + +asyncio.run(extract_tech_content()) +``` + +## Advanced Usage: Combining JS Execution with LLM Extraction + +This example demonstrates how to combine JavaScript execution with LLM extraction to handle dynamic content: + +```python +async def extract_dynamic_content(): + js_code = """ + const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); + if (loadMoreButton) { + loadMoreButton.click(); + await new Promise(resolve => setTimeout(resolve, 2000)); + } + """ + + wait_for = """ + () => { + const articles = document.querySelectorAll('article.tease-card'); + return articles.length > 10; + } + """ + + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", + js_code=js_code, + wait_for=wait_for, + css_selector="article.tease-card", + extraction_strategy=LLMExtractionStrategy( + provider="openai/gpt-4o", + api_token=os.getenv('OPENAI_API_KEY'), + instruction="Summarize each article, focusing on technology-related content" + ), + bypass_cache=True, + ) + + summaries = json.loads(result.extracted_content) + print(f"Number of summarized articles: {len(summaries)}") + + with open(".data/tech_summaries.json", "w", encoding="utf-8") as f: + json.dump(summaries, f, indent=2) + +asyncio.run(extract_dynamic_content()) +``` + +## Customizing LLM Provider + +Crawl4AI uses the `litellm` library under the hood, which allows you to use any LLM provider you want. Just pass the correct model name and API token: + +```python +extraction_strategy=LLMExtractionStrategy( + provider="your_llm_provider/model_name", + api_token="your_api_token", + instruction="Your extraction instruction" +) +``` + +This flexibility allows you to integrate with various LLM providers and tailor the extraction process to your specific needs. + +## Error Handling and Retries + +When working with external LLM APIs, it's important to handle potential errors and implement retry logic. 
Here's an example of how you might do this: + +```python +import asyncio +from tenacity import retry, stop_after_attempt, wait_exponential + +class LLMExtractionError(Exception): + pass + +@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10)) +async def extract_with_retry(crawler, url, extraction_strategy): + try: + result = await crawler.arun(url=url, extraction_strategy=extraction_strategy, bypass_cache=True) + return json.loads(result.extracted_content) + except Exception as e: + raise LLMExtractionError(f"Failed to extract content: {str(e)}") + +async def main(): + async with AsyncWebCrawler(verbose=True) as crawler: + try: + content = await extract_with_retry( + crawler, + "https://www.example.com", + LLMExtractionStrategy( + provider="openai/gpt-4o", + api_token=os.getenv('OPENAI_API_KEY'), + instruction="Extract and summarize main points" + ) + ) + print("Extracted content:", content) + except LLMExtractionError as e: + print(f"Extraction failed after retries: {e}") + +asyncio.run(main()) +``` + +This example uses the `tenacity` library to implement a retry mechanism with exponential backoff, which can help handle temporary failures or rate limiting from the LLM API. \ No newline at end of file diff --git a/docs/md_v2/extraction/overview.md b/docs/md_v2/extraction/overview.md new file mode 100644 index 0000000000000000000000000000000000000000..7c52447501bac3bcfe0061880375cd6f4853b1d3 --- /dev/null +++ b/docs/md_v2/extraction/overview.md @@ -0,0 +1,226 @@ +# Extraction Strategies Overview + +Crawl4AI provides powerful extraction strategies to help you get structured data from web pages. Each strategy is designed for specific use cases and offers different approaches to data extraction. + +## Available Strategies + +### [LLM-Based Extraction](llm.md) + +`LLMExtractionStrategy` uses Language Models to extract structured data from web content. This approach is highly flexible and can understand content semantically. + +```python +from pydantic import BaseModel +from crawl4ai.extraction_strategy import LLMExtractionStrategy + +class Product(BaseModel): + name: str + price: float + description: str + +strategy = LLMExtractionStrategy( + provider="ollama/llama2", + schema=Product.schema(), + instruction="Extract product details from the page" +) + +result = await crawler.arun( + url="https://example.com/product", + extraction_strategy=strategy +) +``` + +**Best for:** +- Complex data structures +- Content requiring interpretation +- Flexible content formats +- Natural language processing + +### [CSS-Based Extraction](css.md) + +`JsonCssExtractionStrategy` extracts data using CSS selectors. This is fast, reliable, and perfect for consistently structured pages. + +```python +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy + +schema = { + "name": "Product Listing", + "baseSelector": ".product-card", + "fields": [ + {"name": "title", "selector": "h2", "type": "text"}, + {"name": "price", "selector": ".price", "type": "text"}, + {"name": "image", "selector": "img", "type": "attribute", "attribute": "src"} + ] +} + +strategy = JsonCssExtractionStrategy(schema) + +result = await crawler.arun( + url="https://example.com/products", + extraction_strategy=strategy +) +``` + +**Best for:** +- E-commerce product listings +- News article collections +- Structured content pages +- High-performance needs + +### [Cosine Strategy](cosine.md) + +`CosineStrategy` uses similarity-based clustering to identify and extract relevant content sections. 
+ +```python +from crawl4ai.extraction_strategy import CosineStrategy + +strategy = CosineStrategy( + semantic_filter="product reviews", # Content focus + word_count_threshold=10, # Minimum words per cluster + sim_threshold=0.3, # Similarity threshold + max_dist=0.2, # Maximum cluster distance + top_k=3 # Number of top clusters to extract +) + +result = await crawler.arun( + url="https://example.com/reviews", + extraction_strategy=strategy +) +``` + +**Best for:** +- Content similarity analysis +- Topic clustering +- Relevant content extraction +- Pattern recognition in text + +## Strategy Selection Guide + +Choose your strategy based on these factors: + +1. **Content Structure** + - Well-structured HTML → Use CSS Strategy + - Natural language text → Use LLM Strategy + - Mixed/Complex content → Use Cosine Strategy + +2. **Performance Requirements** + - Fastest: CSS Strategy + - Moderate: Cosine Strategy + - Variable: LLM Strategy (depends on provider) + +3. **Accuracy Needs** + - Highest structure accuracy: CSS Strategy + - Best semantic understanding: LLM Strategy + - Best content relevance: Cosine Strategy + +## Combining Strategies + +You can combine strategies for more powerful extraction: + +```python +# First use CSS strategy for initial structure +css_result = await crawler.arun( + url="https://example.com", + extraction_strategy=css_strategy +) + +# Then use LLM for semantic analysis +llm_result = await crawler.arun( + url="https://example.com", + extraction_strategy=llm_strategy +) +``` + +## Common Use Cases + +1. **E-commerce Scraping** + ```python + # CSS Strategy for product listings + schema = { + "name": "Products", + "baseSelector": ".product", + "fields": [ + {"name": "name", "selector": ".title", "type": "text"}, + {"name": "price", "selector": ".price", "type": "text"} + ] + } + ``` + +2. **News Article Extraction** + ```python + # LLM Strategy for article content + class Article(BaseModel): + title: str + content: str + author: str + date: str + + strategy = LLMExtractionStrategy( + provider="ollama/llama2", + schema=Article.schema() + ) + ``` + +3. **Content Analysis** + ```python + # Cosine Strategy for topic analysis + strategy = CosineStrategy( + semantic_filter="technology trends", + top_k=5 + ) + ``` + + +## Input Formats +All extraction strategies support different input formats to give you more control over how content is processed: + +- **markdown** (default): Uses the raw markdown conversion of the HTML content. Best for general text extraction where HTML structure isn't critical. +- **html**: Uses the raw HTML content. Useful when you need to preserve HTML structure or extract data from specific HTML elements. +- **fit_markdown**: Uses the cleaned and filtered markdown content. Best for extracting relevant content while removing noise. Requires a markdown generator with content filter to be configured. 
+ +To specify an input format: +```python +strategy = LLMExtractionStrategy( + input_format="html", # or "markdown" or "fit_markdown" + provider="openai/gpt-4", + instruction="Extract product information" +) +``` + +Note: When using "fit_markdown", ensure your CrawlerRunConfig includes a markdown generator with content filter: +```python +config = CrawlerRunConfig( + extraction_strategy=strategy, + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter() # Content filter goes here for fit_markdown + ) +) +``` + +If fit_markdown is requested but not available (no markdown generator or content filter), the system will automatically fall back to raw markdown with a warning. + +## Best Practices + +1. **Choose the Right Strategy** + - Start with CSS for structured data + - Use LLM for complex interpretation + - Try Cosine for content relevance + +2. **Optimize Performance** + - Cache LLM results + - Keep CSS selectors specific + - Tune similarity thresholds + +3. **Handle Errors** + ```python + result = await crawler.arun( + url="https://example.com", + extraction_strategy=strategy + ) + + if not result.success: + print(f"Extraction failed: {result.error_message}") + else: + data = json.loads(result.extracted_content) + ``` + +Each strategy has its strengths and optimal use cases. Explore the detailed documentation for each strategy to learn more about their specific features and configurations. \ No newline at end of file diff --git a/docs/md_v2/index.md b/docs/md_v2/index.md new file mode 100644 index 0000000000000000000000000000000000000000..65ea6da809dfd69ff00985110b81cf7eccb90345 --- /dev/null +++ b/docs/md_v2/index.md @@ -0,0 +1,113 @@ +# Crawl4AI + +Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI. + +## Introduction + +Crawl4AI has one clear task: to make crawling and data extraction from web pages easy and efficient, especially for large language models (LLMs) and AI applications. Whether you are using it as a REST API or a Python library, Crawl4AI offers a robust and flexible solution with full asynchronous support. + +## Quick Start + +Here's a quick example to show you how easy it is to use Crawl4AI with its asynchronous capabilities: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def main(): + # Create an instance of AsyncWebCrawler + async with AsyncWebCrawler(verbose=True) as crawler: + # Run the crawler on a URL + result = await crawler.arun(url="https://www.nbcnews.com/business") + + # Print the extracted content + print(result.markdown) + +# Run the async main function +asyncio.run(main()) +``` + +## Key Features ✨ + +- 🆓 Completely free and open-source +- 🚀 Blazing fast performance, outperforming many paid services +- 🤖 LLM-friendly output formats (JSON, cleaned HTML, markdown) +- 📄 Fit markdown generation for extracting main article content. 
+- 🌐 Multi-browser support (Chromium, Firefox, WebKit) +- 🌍 Supports crawling multiple URLs simultaneously +- 🎨 Extracts and returns all media tags (Images, Audio, and Video) +- 🔗 Extracts all external and internal links +- 📚 Extracts metadata from the page +- 🔄 Custom hooks for authentication, headers, and page modifications +- 🕵️ User-agent customization +- 🖼️ Takes screenshots of pages with enhanced error handling +- 📜 Executes multiple custom JavaScripts before crawling +- 📊 Generates structured output without LLM using JsonCssExtractionStrategy +- 📚 Various chunking strategies: topic-based, regex, sentence, and more +- 🧠 Advanced extraction strategies: cosine clustering, LLM, and more +- 🎯 CSS selector support for precise data extraction +- 📝 Passes instructions/keywords to refine extraction +- 🔒 Proxy support with authentication for enhanced access +- 🔄 Session management for complex multi-page crawling +- 🌐 Asynchronous architecture for improved performance +- 🖼️ Improved image processing with lazy-loading detection +- 🕰️ Enhanced handling of delayed content loading +- 🔑 Custom headers support for LLM interactions +- 🖼️ iframe content extraction for comprehensive analysis +- ⏱️ Flexible timeout and delayed content retrieval options + +## Documentation Structure + +Our documentation is organized into several sections: + +### Basic Usage +- [Installation](basic/installation.md) +- [Quick Start](basic/quickstart.md) +- [Simple Crawling](basic/simple-crawling.md) +- [Browser Configuration](basic/browser-config.md) +- [Content Selection](basic/content-selection.md) +- [Output Formats](basic/output-formats.md) +- [Page Interaction](basic/page-interaction.md) + +### Advanced Features +- [Magic Mode](advanced/magic-mode.md) +- [Session Management](advanced/session-management.md) +- [Hooks & Authentication](advanced/hooks-auth.md) +- [Proxy & Security](advanced/proxy-security.md) +- [Content Processing](advanced/content-processing.md) + +### Extraction & Processing +- [Extraction Strategies Overview](extraction/overview.md) +- [LLM Integration](extraction/llm.md) +- [CSS-Based Extraction](extraction/css.md) +- [Cosine Strategy](extraction/cosine.md) +- [Chunking Strategies](extraction/chunking.md) + +### API Reference +- [AsyncWebCrawler](api/async-webcrawler.md) +- [CrawlResult](api/crawl-result.md) +- [Extraction Strategies](api/strategies.md) +- [arun() Method Parameters](api/arun.md) + +### Examples +- Coming soon! + +## Getting Started + +1. Install Crawl4AI: +```bash +pip install crawl4ai +``` + +2. Check out our [Quick Start Guide](basic/quickstart.md) to begin crawling web pages. + +3. Explore our [examples](https://github.com/unclecode/crawl4ai/tree/main/docs/examples) to see Crawl4AI in action. + +## Support + +For questions, suggestions, or issues: +- GitHub Issues: [Report a Bug](https://github.com/unclecode/crawl4ai/issues) +- Twitter: [@unclecode](https://twitter.com/unclecode) +- Website: [crawl4ai.com](https://crawl4ai.com) + +Happy Crawling! 
🕸️🚀 \ No newline at end of file diff --git a/docs/md_v2/tutorial/episode_01_Introduction_to_Crawl4AI_and_Basic_Installation.md b/docs/md_v2/tutorial/episode_01_Introduction_to_Crawl4AI_and_Basic_Installation.md new file mode 100644 index 0000000000000000000000000000000000000000..fb1846b557d0e0b88d911425a5e3db5c396790d5 --- /dev/null +++ b/docs/md_v2/tutorial/episode_01_Introduction_to_Crawl4AI_and_Basic_Installation.md @@ -0,0 +1,51 @@ +# Crawl4AI + +## Episode 1: Introduction to Crawl4AI and Basic Installation + +### Quick Intro +Walk through installation from PyPI, setup, and verification. Show how to install with options like `torch` or `transformer` for advanced capabilities. + +Here's a condensed outline of the **Installation and Setup** video content: + +--- + +1) **Introduction to Crawl4AI**: Briefly explain that Crawl4AI is a powerful tool for web scraping, data extraction, and content processing, with customizable options for various needs. + +2) **Installation Overview**: + + - **Basic Install**: Run `pip install crawl4ai` and `playwright install` (to set up browser dependencies). + + - **Optional Advanced Installs**: + - `pip install crawl4ai[torch]` - Adds PyTorch for clustering. + - `pip install crawl4ai[transformer]` - Adds support for LLM-based extraction. + - `pip install crawl4ai[all]` - Installs all features for complete functionality. + +3) **Verifying the Installation**: + + - Walk through a simple test script to confirm the setup: + ```python + import asyncio + from crawl4ai import AsyncWebCrawler + + async def main(): + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun(url="https://www.example.com") + print(result.markdown[:500]) # Show first 500 characters + + asyncio.run(main()) + ``` + - Explain that this script initializes the crawler and runs it on a test URL, displaying part of the extracted content to verify functionality. + +4) **Important Tips**: + + - **Run** `playwright install` **after installation** to set up dependencies. + - **For full performance** on text-related tasks, run `crawl4ai-download-models` after installing with `[torch]`, `[transformer]`, or `[all]` options. + - If you encounter issues, refer to the documentation or GitHub issues. + +5) **Wrap Up**: + + - Introduce the next topic in the series, which will cover Crawl4AI's browser configuration options (like choosing between `chromium`, `firefox`, and `webkit`). + +--- + +This structure provides a concise, effective guide to get viewers up and running with Crawl4AI in minutes. \ No newline at end of file diff --git a/docs/md_v2/tutorial/episode_02_Overview_of_Advanced_Features.md b/docs/md_v2/tutorial/episode_02_Overview_of_Advanced_Features.md new file mode 100644 index 0000000000000000000000000000000000000000..c4fd09df8a38c27ce2b3c3bf3cf88d1d570f6049 --- /dev/null +++ b/docs/md_v2/tutorial/episode_02_Overview_of_Advanced_Features.md @@ -0,0 +1,78 @@ +# Crawl4AI + +## Episode 2: Overview of Advanced Features + +### Quick Intro +A general overview of advanced features like hooks, CSS selectors, and JSON CSS extraction. + +Here's a condensed outline for an **Overview of Advanced Features** video covering Crawl4AI's powerful customization and extraction options: + +--- + +### **Overview of Advanced Features** + +1) **Introduction to Advanced Features**: + + - Briefly introduce Crawl4AI’s advanced tools, which let users go beyond basic crawling to customize and fine-tune their scraping workflows. 
+ +2) **Taking Screenshots**: + + - Explain the screenshot capability for capturing page state and verifying content. + - **Example**: + ```python + result = await crawler.arun(url="https://www.example.com", screenshot=True) + ``` + - Mention that screenshots are saved as a base64 string in `result`, allowing easy decoding and saving. + +3) **Media and Link Extraction**: + + - Demonstrate how to pull all media (images, videos) and links (internal and external) from a page for deeper analysis or content gathering. + - **Example**: + ```python + result = await crawler.arun(url="https://www.example.com") + print("Media:", result.media) + print("Links:", result.links) + ``` + +4) **Custom User Agent**: + + - Show how to set a custom user agent to disguise the crawler or simulate specific devices/browsers. + - **Example**: + ```python + result = await crawler.arun(url="https://www.example.com", user_agent="Mozilla/5.0 (compatible; MyCrawler/1.0)") + ``` + +5) **Custom Hooks for Enhanced Control**: + + - Briefly cover how to use hooks, which allow custom actions like setting headers or handling login during the crawl. + - **Example**: Setting a custom header with `before_get_url` hook. + ```python + async def before_get_url(page): + await page.set_extra_http_headers({"X-Test-Header": "test"}) + ``` + +6) **CSS Selectors for Targeted Extraction**: + + - Explain the use of CSS selectors to extract specific elements, ideal for structured data like articles or product details. + - **Example**: + ```python + result = await crawler.arun(url="https://www.example.com", css_selector="h2") + print("H2 Tags:", result.extracted_content) + ``` + +7) **Crawling Inside Iframes**: + + - Mention how enabling `process_iframes=True` allows extracting content within iframes, useful for sites with embedded content or ads. + - **Example**: + ```python + result = await crawler.arun(url="https://www.example.com", process_iframes=True) + ``` + +8) **Wrap-Up**: + + - Summarize these advanced features and how they allow users to customize every part of their web scraping experience. + - Tease upcoming videos where each feature will be explored in detail. + +--- + +This covers each advanced feature with a brief example, providing a useful overview to prepare viewers for the more in-depth videos. \ No newline at end of file diff --git a/docs/md_v2/tutorial/episode_03_Browser_Configurations_&_Headless_Crawling.md b/docs/md_v2/tutorial/episode_03_Browser_Configurations_&_Headless_Crawling.md new file mode 100644 index 0000000000000000000000000000000000000000..45f1a353b065cca0f3f64d4b7427d50539391af0 --- /dev/null +++ b/docs/md_v2/tutorial/episode_03_Browser_Configurations_&_Headless_Crawling.md @@ -0,0 +1,65 @@ +# Crawl4AI + +## Episode 3: Browser Configurations & Headless Crawling + +### Quick Intro +Explain browser options (`chromium`, `firefox`, `webkit`) and settings for headless mode, caching, and verbose logging. + +Here’s a streamlined outline for the **Browser Configurations & Headless Crawling** video: + +--- + +### **Browser Configurations & Headless Crawling** + +1) **Overview of Browser Options**: + + - Crawl4AI supports three browser engines: + - **Chromium** (default) - Highly compatible. + - **Firefox** - Great for specialized use cases. + - **Webkit** - Lightweight, ideal for basic needs. 
+ - **Example**: + ```python + # Using Chromium (default) + crawler = AsyncWebCrawler(browser_type="chromium") + + # Using Firefox + crawler = AsyncWebCrawler(browser_type="firefox") + + # Using WebKit + crawler = AsyncWebCrawler(browser_type="webkit") + ``` + +2) **Headless Mode**: + + - Headless mode runs the browser without a visible GUI, making it faster and less resource-intensive. + - To enable or disable: + ```python + # Headless mode (default is True) + crawler = AsyncWebCrawler(headless=True) + + # Disable headless mode for debugging + crawler = AsyncWebCrawler(headless=False) + ``` + +3) **Verbose Logging**: + - Use `verbose=True` to get detailed logs for each action, useful for debugging: + ```python + crawler = AsyncWebCrawler(verbose=True) + ``` + +4) **Running a Basic Crawl with Configuration**: + - Example of a simple crawl with custom browser settings: + ```python + async with AsyncWebCrawler(browser_type="firefox", headless=True, verbose=True) as crawler: + result = await crawler.arun(url="https://www.example.com") + print(result.markdown[:500]) # Show first 500 characters + ``` + - This example uses Firefox in headless mode with logging enabled, demonstrating the flexibility of Crawl4AI’s setup. + +5) **Recap & Next Steps**: + - Recap the power of selecting different browsers and running headless mode for speed and efficiency. + - Tease the next video: **Proxy & Security Settings** for navigating blocked or restricted content and protecting IP identity. + +--- + +This breakdown covers browser configuration essentials in Crawl4AI, providing users with practical steps to optimize their scraping setup. \ No newline at end of file diff --git a/docs/md_v2/tutorial/episode_04_Advanced_Proxy_and_Security_Settings.md b/docs/md_v2/tutorial/episode_04_Advanced_Proxy_and_Security_Settings.md new file mode 100644 index 0000000000000000000000000000000000000000..ea235962349ef44364dfe767ab52a22e05b18fc2 --- /dev/null +++ b/docs/md_v2/tutorial/episode_04_Advanced_Proxy_and_Security_Settings.md @@ -0,0 +1,90 @@ +# Crawl4AI + +## Episode 4: Advanced Proxy and Security Settings + +### Quick Intro +Showcase proxy configurations (HTTP, SOCKS5, authenticated proxies). Demo: Use rotating proxies and set custom headers to avoid IP blocking and enhance security. + +Here’s a focused outline for the **Proxy and Security Settings** video: + +--- + +### **Proxy & Security Settings** + +1) **Why Use Proxies in Web Crawling**: + + - Proxies are essential for bypassing IP-based restrictions, improving anonymity, and managing rate limits. + - Crawl4AI supports simple proxies, authenticated proxies, and proxy rotation for robust web scraping. 
+ +2) **Basic Proxy Setup**: + + - **Using a Simple Proxy**: + ```python + # HTTP proxy + crawler = AsyncWebCrawler(proxy="http://proxy.example.com:8080") + + # SOCKS proxy + crawler = AsyncWebCrawler(proxy="socks5://proxy.example.com:1080") + ``` + +3) **Authenticated Proxies**: + + - Use `proxy_config` for proxies requiring a username and password: + ```python + proxy_config = { + "server": "http://proxy.example.com:8080", + "username": "user", + "password": "pass" + } + crawler = AsyncWebCrawler(proxy_config=proxy_config) + ``` + +4) **Rotating Proxies**: + + - Rotating proxies helps avoid IP bans by switching IP addresses for each request: + ```python + async def get_next_proxy(): + # Define proxy rotation logic here + return {"server": "http://next.proxy.com:8080"} + + async with AsyncWebCrawler() as crawler: + for url in urls: + proxy = await get_next_proxy() + crawler.update_proxy(proxy) + result = await crawler.arun(url=url) + ``` + - This setup periodically switches the proxy for enhanced security and access. + +5) **Custom Headers for Additional Security**: + + - Set custom headers to mask the crawler’s identity and avoid detection: + ```python + headers = { + "X-Forwarded-For": "203.0.113.195", + "Accept-Language": "en-US,en;q=0.9", + "Cache-Control": "no-cache", + "Pragma": "no-cache" + } + crawler = AsyncWebCrawler(headers=headers) + ``` + +6) **Combining Proxies with Magic Mode for Anti-Bot Protection**: + + - For sites with aggressive bot detection, combine `proxy` settings with `magic=True`: + ```python + async with AsyncWebCrawler(proxy="http://proxy.example.com:8080", headers={"Accept-Language": "en-US"}) as crawler: + result = await crawler.arun( + url="https://example.com", + magic=True # Enables anti-detection features + ) + ``` + - **Magic Mode** automatically enables user simulation, random timing, and browser property masking. + +7) **Wrap Up & Next Steps**: + + - Summarize the importance of proxies and anti-detection in accessing restricted content and avoiding bans. + - Tease the next video: **JavaScript Execution and Handling Dynamic Content** for working with interactive and dynamically loaded pages. + +--- + +This outline provides a practical guide to setting up proxies and security configurations, empowering users to navigate restricted sites while staying undetected. \ No newline at end of file diff --git a/docs/md_v2/tutorial/episode_05_JavaScript_Execution_and_Dynamic_Content_Handling.md b/docs/md_v2/tutorial/episode_05_JavaScript_Execution_and_Dynamic_Content_Handling.md new file mode 100644 index 0000000000000000000000000000000000000000..98d0968feb238e16a285418f0b499547b5183fe5 --- /dev/null +++ b/docs/md_v2/tutorial/episode_05_JavaScript_Execution_and_Dynamic_Content_Handling.md @@ -0,0 +1,97 @@ +# Crawl4AI + +## Episode 5: JavaScript Execution and Dynamic Content Handling + +### Quick Intro +Explain JavaScript code injection with examples (e.g., simulating scrolling, clicking ‘load more’). Demo: Extract content from a page that uses dynamic loading with lazy-loaded images. + +Here’s a focused outline for the **JavaScript Execution and Dynamic Content Handling** video: + +--- + +### **JavaScript Execution & Dynamic Content Handling** + +1) **Why JavaScript Execution Matters**: + + - Many modern websites load content dynamically via JavaScript, requiring special handling to access all elements. 
+ - Crawl4AI can execute JavaScript on pages, enabling it to interact with elements like “load more” buttons, infinite scrolls, and content that appears only after certain actions. + +2) **Basic JavaScript Execution**: + + - Use `js_code` to execute JavaScript commands on a page: + ```python + # Scroll to bottom of the page + result = await crawler.arun( + url="https://example.com", + js_code="window.scrollTo(0, document.body.scrollHeight);" + ) + ``` + - This command scrolls to the bottom, triggering any lazy-loaded or dynamically added content. + +3) **Multiple Commands & Simulating Clicks**: + + - Combine multiple JavaScript commands to interact with elements like “load more” buttons: + ```python + js_commands = [ + "window.scrollTo(0, document.body.scrollHeight);", + "document.querySelector('.load-more').click();" + ] + result = await crawler.arun( + url="https://example.com", + js_code=js_commands + ) + ``` + - This script scrolls down and then clicks the “load more” button, useful for loading additional content blocks. + +4) **Waiting for Dynamic Content**: + + - Use `wait_for` to ensure the page loads specific elements before proceeding: + ```python + result = await crawler.arun( + url="https://example.com", + js_code="window.scrollTo(0, document.body.scrollHeight);", + wait_for="css:.dynamic-content" # Wait for elements with class `.dynamic-content` + ) + ``` + - This example waits until elements with `.dynamic-content` are loaded, helping to capture content that appears after JavaScript actions. + +5) **Handling Complex Dynamic Content (e.g., Infinite Scroll)**: + + - Combine JavaScript execution with conditional waiting to handle infinite scrolls or paginated content: + ```python + result = await crawler.arun( + url="https://example.com", + js_code=[ + "window.scrollTo(0, document.body.scrollHeight);", + "const loadMore = document.querySelector('.load-more'); if (loadMore) loadMore.click();" + ], + wait_for="js:() => document.querySelectorAll('.item').length > 10" # Wait until 10 items are loaded + ) + ``` + - This example scrolls and clicks "load more" repeatedly, waiting each time for a specified number of items to load. + +6) **Complete Example: Dynamic Content Handling with Extraction**: + + - Full example demonstrating a dynamic load and content extraction in one process: + ```python + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com", + js_code=[ + "window.scrollTo(0, document.body.scrollHeight);", + "document.querySelector('.load-more').click();" + ], + wait_for="css:.main-content", + css_selector=".main-content" + ) + print(result.markdown[:500]) # Output the main content extracted + ``` + +7) **Wrap Up & Next Steps**: + + - Recap how JavaScript execution allows access to dynamic content, enabling powerful interactions. + - Tease the next video: **Content Cleaning and Fit Markdown** to show how Crawl4AI can extract only the most relevant content from complex pages. + +--- + +This outline explains how to handle dynamic content and JavaScript-based interactions effectively, enabling users to scrape and interact with complex, modern websites. 
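
As a supplement to section 5 of this outline, here is a minimal, hedged sketch of the repeated scroll-and-click loop described there. It reuses the `session_id` option covered in the session-based crawling episode so each `arun` call continues the same browser state; the `.item` and `.load-more` selectors, the round count, and the item thresholds are illustrative assumptions, not values required by Crawl4AI.

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def load_all_items():
    async with AsyncWebCrawler() as crawler:
        session_id = "infinite_scroll_demo"  # keep one browser state across calls (see the session-based crawling episode)
        result = None
        for round_number in range(1, 4):  # assumed cap of three "load more" rounds
            result = await crawler.arun(
                url="https://example.com",
                session_id=session_id,
                js_code=[
                    "window.scrollTo(0, document.body.scrollHeight);",
                    "const btn = document.querySelector('.load-more'); if (btn) btn.click();"
                ],
                # wait until this round has added more items; the threshold is illustrative
                wait_for=f"js:() => document.querySelectorAll('.item').length >= {round_number * 10}",
                css_selector=".item"
            )
        if result:
            print(result.markdown[:300])

asyncio.run(load_all_items())
```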
\ No newline at end of file diff --git a/docs/md_v2/tutorial/episode_06_Magic_Mode_and_Anti-Bot_Protection.md b/docs/md_v2/tutorial/episode_06_Magic_Mode_and_Anti-Bot_Protection.md new file mode 100644 index 0000000000000000000000000000000000000000..dfc3e5a2c15b342d3171f5ff6066d2637335bee4 --- /dev/null +++ b/docs/md_v2/tutorial/episode_06_Magic_Mode_and_Anti-Bot_Protection.md @@ -0,0 +1,86 @@ +# Crawl4AI + +## Episode 6: Magic Mode and Anti-Bot Protection + +### Quick Intro +Highlight `Magic Mode` and anti-bot features like user simulation, navigator overrides, and timing randomization. Demo: Access a site with anti-bot protection and show how `Magic Mode` seamlessly handles it. + +Here’s a concise outline for the **Magic Mode and Anti-Bot Protection** video: + +--- + +### **Magic Mode & Anti-Bot Protection** + +1) **Why Anti-Bot Protection is Important**: + + - Many websites use bot detection mechanisms to block automated scraping. Crawl4AI’s anti-detection features help avoid IP bans, CAPTCHAs, and access restrictions. + - **Magic Mode** is a one-step solution to enable a range of anti-bot features without complex configuration. + +2) **Enabling Magic Mode**: + + - Simply set `magic=True` to activate Crawl4AI’s full anti-bot suite: + ```python + result = await crawler.arun( + url="https://example.com", + magic=True # Enables all anti-detection features + ) + ``` + - This enables a blend of stealth techniques, including masking automation signals, randomizing timings, and simulating real user behavior. + +3) **What Magic Mode Does Behind the Scenes**: + + - **User Simulation**: Mimics human actions like mouse movements and scrolling. + - **Navigator Overrides**: Hides signals that indicate an automated browser. + - **Timing Randomization**: Adds random delays to simulate natural interaction patterns. + - **Cookie Handling**: Accepts and manages cookies dynamically to avoid triggers from cookie pop-ups. + +4) **Manual Anti-Bot Options (If Not Using Magic Mode)**: + + - For granular control, you can configure individual settings without Magic Mode: + ```python + result = await crawler.arun( + url="https://example.com", + simulate_user=True, # Enables human-like behavior + override_navigator=True # Hides automation fingerprints + ) + ``` + - **Use Cases**: This approach allows more specific adjustments when certain anti-bot features are needed but others are not. + +5) **Combining Proxies with Magic Mode**: + + - To avoid rate limits or IP blocks, combine Magic Mode with a proxy: + ```python + async with AsyncWebCrawler( + proxy="http://proxy.example.com:8080", + headers={"Accept-Language": "en-US"} + ) as crawler: + result = await crawler.arun( + url="https://example.com", + magic=True # Full anti-detection + ) + ``` + - This setup maximizes stealth by pairing anti-bot detection with IP obfuscation. + +6) **Example of Anti-Bot Protection in Action**: + + - Full example with Magic Mode and proxies to scrape a protected page: + ```python + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/protected-content", + magic=True, + proxy="http://proxy.example.com:8080", + wait_for="css:.content-loaded" # Wait for the main content to load + ) + print(result.markdown[:500]) # Display first 500 characters of the content + ``` + - This example ensures seamless access to protected content by combining anti-detection and waiting for full content load. 
+ +7) **Wrap Up & Next Steps**: + + - Recap the power of Magic Mode and anti-bot features for handling restricted websites. + - Tease the next video: **Content Cleaning and Fit Markdown** to show how to extract clean and focused content from a page. + +--- + +This outline shows users how to easily avoid bot detection and access restricted content, demonstrating both the power and simplicity of Magic Mode in Crawl4AI. \ No newline at end of file diff --git a/docs/md_v2/tutorial/episode_07_Content_Cleaning_and_Fit_Markdown.md b/docs/md_v2/tutorial/episode_07_Content_Cleaning_and_Fit_Markdown.md new file mode 100644 index 0000000000000000000000000000000000000000..60ef9eea7535f6855478a1f675f668161d356e36 --- /dev/null +++ b/docs/md_v2/tutorial/episode_07_Content_Cleaning_and_Fit_Markdown.md @@ -0,0 +1,89 @@ +# Crawl4AI + +## Episode 7: Content Cleaning and Fit Markdown + +### Quick Intro +Explain content cleaning options, including `fit_markdown` to keep only the most relevant content. Demo: Extract and compare regular vs. fit markdown from a news site or blog. + +Here’s a streamlined outline for the **Content Cleaning and Fit Markdown** video: + +--- + +### **Content Cleaning & Fit Markdown** + +1) **Overview of Content Cleaning in Crawl4AI**: + + - Explain that web pages often include extra elements like ads, navigation bars, footers, and popups. + - Crawl4AI’s content cleaning features help extract only the main content, reducing noise and enhancing readability. + +2) **Basic Content Cleaning Options**: + + - **Removing Unwanted Elements**: Exclude specific HTML tags, like forms or navigation bars: + ```python + result = await crawler.arun( + url="https://example.com", + word_count_threshold=10, # Filter out blocks with fewer than 10 words + excluded_tags=['form', 'nav'], # Exclude specific tags + remove_overlay_elements=True # Remove popups and modals + ) + ``` + - This example extracts content while excluding forms, navigation, and modal overlays, ensuring clean results. + +3) **Fit Markdown for Main Content Extraction**: + + - **What is Fit Markdown**: Uses advanced analysis to identify the most relevant content (ideal for articles, blogs, and documentation). + - **How it Works**: Analyzes content density, removes boilerplate elements, and maintains formatting for a clear output. + - **Example**: + ```python + result = await crawler.arun(url="https://example.com") + main_content = result.fit_markdown # Extracted main content + print(main_content[:500]) # Display first 500 characters + ``` + - Fit Markdown is especially helpful for long-form content like news articles or blog posts. + +4) **Comparing Fit Markdown with Regular Markdown**: + + - **Fit Markdown** returns the primary content without extraneous elements. + - **Regular Markdown** includes all extracted text in markdown format. + - Example to show the difference: + ```python + all_content = result.markdown # Full markdown + main_content = result.fit_markdown # Only the main content + + print(f"All Content Length: {len(all_content)}") + print(f"Main Content Length: {len(main_content)}") + ``` + - This comparison shows the effectiveness of Fit Markdown in focusing on essential content. 
+ +5) **Media and Metadata Handling with Content Cleaning**: + + - **Media Extraction**: Crawl4AI captures images and videos with metadata like alt text, descriptions, and relevance scores: + ```python + for image in result.media["images"]: + print(f"Source: {image['src']}, Alt Text: {image['alt']}, Relevance Score: {image['score']}") + ``` + - **Use Case**: Useful for saving only relevant images or videos from an article or content-heavy page. + +6) **Example of Clean Content Extraction in Action**: + + - Full example extracting cleaned content and Fit Markdown: + ```python + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com", + word_count_threshold=10, + excluded_tags=['nav', 'footer'], + remove_overlay_elements=True + ) + print(result.fit_markdown[:500]) # Show main content + ``` + - This example demonstrates content cleaning with settings for filtering noise and focusing on the core text. + +7) **Wrap Up & Next Steps**: + + - Summarize the power of Crawl4AI’s content cleaning features and Fit Markdown for capturing clean, relevant content. + - Tease the next video: **Link Analysis and Smart Filtering** to focus on analyzing and filtering links within crawled pages. + +--- + +This outline covers Crawl4AI’s content cleaning features and the unique benefits of Fit Markdown, showing users how to retrieve focused, high-quality content from web pages. \ No newline at end of file diff --git a/docs/md_v2/tutorial/episode_08_Media_Handling_Images_Videos_and_Audio.md b/docs/md_v2/tutorial/episode_08_Media_Handling_Images_Videos_and_Audio.md new file mode 100644 index 0000000000000000000000000000000000000000..c0daacadf8fe509c6522de3d1ac5670624e62a08 --- /dev/null +++ b/docs/md_v2/tutorial/episode_08_Media_Handling_Images_Videos_and_Audio.md @@ -0,0 +1,116 @@ +# Crawl4AI + +## Episode 8: Media Handling: Images, Videos, and Audio + +### Quick Intro +Showcase Crawl4AI’s media extraction capabilities, including lazy-loaded media and metadata. Demo: Crawl a multimedia page, extract images, and show metadata (alt text, context, relevance score). + +Here’s a clear and focused outline for the **Media Handling: Images, Videos, and Audio** video: + +--- + +### **Media Handling: Images, Videos, and Audio** + +1) **Overview of Media Extraction in Crawl4AI**: + + - Crawl4AI can detect and extract different types of media (images, videos, and audio) along with useful metadata. + - This functionality is essential for gathering visual content from multimedia-heavy pages like e-commerce sites, news articles, and social media feeds. + +2) **Image Extraction and Metadata**: + + - Crawl4AI captures images with detailed metadata, including: + - **Source URL**: The direct URL to the image. + - **Alt Text**: Image description if available. + - **Relevance Score**: A score (0–10) indicating how relevant the image is to the main content. + - **Context**: Text surrounding the image on the page. + - **Example**: + ```python + result = await crawler.arun(url="https://example.com") + + for image in result.media["images"]: + print(f"Source: {image['src']}") + print(f"Alt Text: {image['alt']}") + print(f"Relevance Score: {image['score']}") + print(f"Context: {image['context']}") + ``` + - This example shows how to access each image’s metadata, making it easy to filter for the most relevant visuals. + +3) **Handling Lazy-Loaded Images**: + + - Crawl4AI automatically supports lazy-loaded images, which are commonly used to optimize webpage loading. 
+ - **Example with Wait for Lazy-Loaded Content**: + ```python + result = await crawler.arun( + url="https://example.com", + wait_for="css:img[data-src]", # Wait for lazy-loaded images + delay_before_return_html=2.0 # Allow extra time for images to load + ) + ``` + - This setup waits for lazy-loaded images to appear, ensuring they are fully captured. + +4) **Video Extraction and Metadata**: + + - Crawl4AI captures video elements, including: + - **Source URL**: The video’s direct URL. + - **Type**: Format of the video (e.g., MP4). + - **Thumbnail**: A poster or thumbnail image if available. + - **Duration**: Video length, if metadata is provided. + - **Example**: + ```python + for video in result.media["videos"]: + print(f"Video Source: {video['src']}") + print(f"Type: {video['type']}") + print(f"Thumbnail: {video.get('poster')}") + print(f"Duration: {video.get('duration')}") + ``` + - This allows users to gather video content and relevant details for further processing or analysis. + +5) **Audio Extraction and Metadata**: + + - Audio elements can also be extracted, with metadata like: + - **Source URL**: The audio file’s direct URL. + - **Type**: Format of the audio file (e.g., MP3). + - **Duration**: Length of the audio, if available. + - **Example**: + ```python + for audio in result.media["audios"]: + print(f"Audio Source: {audio['src']}") + print(f"Type: {audio['type']}") + print(f"Duration: {audio.get('duration')}") + ``` + - Useful for sites with podcasts, sound bites, or other audio content. + +6) **Filtering Media by Relevance**: + + - Use metadata like relevance score to filter only the most useful media content: + ```python + relevant_images = [img for img in result.media["images"] if img['score'] > 5] + ``` + - This is especially helpful for content-heavy pages where you only want media directly related to the main content. + +7) **Example: Full Media Extraction with Content Filtering**: + + - Full example extracting images, videos, and audio along with filtering by relevance: + ```python + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com", + word_count_threshold=10, # Filter content blocks for relevance + exclude_external_images=True # Only keep internal images + ) + + # Display media summaries + print(f"Relevant Images: {len(relevant_images)}") + print(f"Videos: {len(result.media['videos'])}") + print(f"Audio Clips: {len(result.media['audios'])}") + ``` + - This example shows how to capture and filter various media types, focusing on what’s most relevant. + +8) **Wrap Up & Next Steps**: + + - Recap the comprehensive media extraction capabilities, emphasizing how metadata helps users focus on relevant content. + - Tease the next video: **Link Analysis and Smart Filtering** to explore how Crawl4AI handles internal, external, and social media links for more focused data gathering. + +--- + +This outline provides users with a complete guide to handling images, videos, and audio in Crawl4AI, using metadata to enhance relevance and precision in multimedia extraction. 
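
As a short supplement tying sections 2 and 6 together, below is a hedged sketch that crawls a page, keeps only images above a relevance-score cutoff, and writes their metadata to a JSON file for later review or downloading. The output filename and score threshold are illustrative choices, not part of Crawl4AI’s API; the `src`, `alt`, and `score` fields are the ones described above.

```python
import asyncio
import json
from crawl4ai import AsyncWebCrawler

async def export_relevant_images(url: str, min_score: int = 5, out_path: str = "relevant_images.json"):
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=url)

    # Keep only images whose relevance score clears the cutoff (see section 6)
    relevant = [
        {"src": img.get("src"), "alt": img.get("alt"), "score": img.get("score")}
        for img in result.media["images"]
        if (img.get("score") or 0) > min_score
    ]

    # Persist the filtered metadata so the images can be reviewed or fetched later
    with open(out_path, "w") as f:  # out_path is an illustrative filename
        json.dump(relevant, f, indent=2)
    print(f"Saved {len(relevant)} images with score > {min_score} to {out_path}")

asyncio.run(export_relevant_images("https://www.example.com"))
```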
diff --git a/docs/md_v2/tutorial/episode_09_Link_Analysis_and_Smart_Filtering.md b/docs/md_v2/tutorial/episode_09_Link_Analysis_and_Smart_Filtering.md new file mode 100644 index 0000000000000000000000000000000000000000..263d77bba96bca38a06a73ec22087d099d2eaf79 --- /dev/null +++ b/docs/md_v2/tutorial/episode_09_Link_Analysis_and_Smart_Filtering.md @@ -0,0 +1,95 @@ +# Crawl4AI + +## Episode 9: Link Analysis and Smart Filtering + +### Quick Intro +Walk through internal and external link classification, social media link filtering, and custom domain exclusion. Demo: Analyze links on a website, focusing on internal navigation vs. external or ad links. + +Here’s a focused outline for the **Link Analysis and Smart Filtering** video: + +--- + +### **Link Analysis & Smart Filtering** + +1) **Importance of Link Analysis in Web Crawling**: + + - Explain that web pages often contain numerous links, including internal links, external links, social media links, and ads. + - Crawl4AI’s link analysis and filtering options help extract only relevant links, enabling more targeted and efficient crawls. + +2) **Automatic Link Classification**: + + - Crawl4AI categorizes links automatically into internal, external, and social media links. + - **Example**: + ```python + result = await crawler.arun(url="https://example.com") + + # Access internal and external links + internal_links = result.links["internal"] + external_links = result.links["external"] + + # Print first few links for each type + print("Internal Links:", internal_links[:3]) + print("External Links:", external_links[:3]) + ``` + +3) **Filtering Out Unwanted Links**: + + - **Exclude External Links**: Remove all links pointing to external sites. + - **Exclude Social Media Links**: Filter out social media domains like Facebook or Twitter. + - **Example**: + ```python + result = await crawler.arun( + url="https://example.com", + exclude_external_links=True, # Remove external links + exclude_social_media_links=True # Remove social media links + ) + ``` + +4) **Custom Domain Filtering**: + + - **Exclude Specific Domains**: Filter links from particular domains, e.g., ad sites. + - **Custom Social Media Domains**: Add additional social media domains if needed. + - **Example**: + ```python + result = await crawler.arun( + url="https://example.com", + exclude_domains=["ads.com", "trackers.com"], + exclude_social_media_domains=["facebook.com", "linkedin.com"] + ) + ``` + +5) **Accessing Link Context and Metadata**: + + - Crawl4AI provides additional metadata for each link, including its text, type (e.g., navigation or content), and surrounding context. + - **Example**: + ```python + for link in result.links["internal"]: + print(f"Link: {link['href']}, Text: {link['text']}, Context: {link['context']}") + ``` + - **Use Case**: Helps users understand the relevance of links based on where they are placed on the page (e.g., navigation vs. article content). 
+ +6) **Example of Comprehensive Link Filtering and Analysis**: + + - Full example combining link filtering, metadata access, and contextual information: + ```python + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com", + exclude_external_links=True, + exclude_social_media_links=True, + exclude_domains=["ads.com"], + css_selector=".main-content" # Focus only on main content area + ) + for link in result.links["internal"]: + print(f"Internal Link: {link['href']}, Text: {link['text']}, Context: {link['context']}") + ``` + - This example filters unnecessary links, keeping only internal and relevant links from the main content area. + +7) **Wrap Up & Next Steps**: + + - Summarize the benefits of link filtering for efficient crawling and relevant content extraction. + - Tease the next video: **Custom Headers, Identity Management, and User Simulation** to explain how to configure identity settings and simulate user behavior for stealthier crawls. + +--- + +This outline provides a practical overview of Crawl4AI’s link analysis and filtering features, helping users target only essential links while eliminating distractions. \ No newline at end of file diff --git a/docs/md_v2/tutorial/episode_10_Custom_Headers,_Identity,_and_User_Simulation.md b/docs/md_v2/tutorial/episode_10_Custom_Headers,_Identity,_and_User_Simulation.md new file mode 100644 index 0000000000000000000000000000000000000000..6eb928f0a9b11f1ad6c4f72a8c29640eb5294d50 --- /dev/null +++ b/docs/md_v2/tutorial/episode_10_Custom_Headers,_Identity,_and_User_Simulation.md @@ -0,0 +1,93 @@ +# Crawl4AI + +## Episode 10: Custom Headers, Identity, and User Simulation + +### Quick Intro +Teach how to use custom headers, user-agent strings, and simulate real user interactions. Demo: Set custom user-agent and headers to access a site that blocks typical crawlers. + +Here’s a concise outline for the **Custom Headers, Identity Management, and User Simulation** video: + +--- + +### **Custom Headers, Identity Management, & User Simulation** + +1) **Why Customize Headers and Identity in Crawling**: + + - Websites often track request headers and browser properties to detect bots. Customizing headers and managing identity help make requests appear more human, improving access to restricted sites. + +2) **Setting Custom Headers**: + + - Customize HTTP headers to mimic genuine browser requests or meet site-specific requirements: + ```python + headers = { + "Accept-Language": "en-US,en;q=0.9", + "X-Requested-With": "XMLHttpRequest", + "Cache-Control": "no-cache" + } + crawler = AsyncWebCrawler(headers=headers) + ``` + - **Use Case**: Customize the `Accept-Language` header to simulate local user settings, or `Cache-Control` to bypass cache for fresh content. + +3) **Setting a Custom User Agent**: + + - Some websites block requests from common crawler user agents. Setting a custom user agent string helps bypass these restrictions: + ```python + crawler = AsyncWebCrawler( + user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + ) + ``` + - **Tip**: Use user-agent strings from popular browsers (e.g., Chrome, Firefox) to improve access and reduce detection risks. 
+ +4) **User Simulation for Human-like Behavior**: + + - Enable `simulate_user=True` to mimic natural user interactions, such as random timing and simulated mouse movements: + ```python + result = await crawler.arun( + url="https://example.com", + simulate_user=True # Simulates human-like behavior + ) + ``` + - **Behavioral Effects**: Adds subtle variations in interactions, making the crawler harder to detect on bot-protected sites. + +5) **Navigator Overrides and Magic Mode for Full Identity Masking**: + + - Use `override_navigator=True` to mask automation indicators like `navigator.webdriver`, which websites check to detect bots: + ```python + result = await crawler.arun( + url="https://example.com", + override_navigator=True # Masks bot-related signals + ) + ``` + - **Combining with Magic Mode**: For a complete anti-bot setup, combine these identity options with `magic=True` for maximum protection: + ```python + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com", + magic=True, # Enables all anti-bot detection features + user_agent="Custom-Agent", # Custom agent with Magic Mode + ) + ``` + - This setup includes all anti-detection techniques like navigator masking, random timing, and user simulation. + +6) **Example: Comprehensive Setup for Identity Management**: + + - A full example combining custom headers, user-agent, and user simulation for a realistic browsing profile: + ```python + async with AsyncWebCrawler( + headers={"Accept-Language": "en-US", "Cache-Control": "no-cache"}, + user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/91.0", + simulate_user=True + ) as crawler: + result = await crawler.arun(url="https://example.com/secure-page") + print(result.markdown[:500]) # Display extracted content + ``` + - This example enables detailed customization for evading detection and accessing protected pages smoothly. + +7) **Wrap Up & Next Steps**: + + - Recap the value of headers, user-agent customization, and simulation in bypassing bot detection. + - Tease the next video: **Extraction Strategies: JSON CSS, LLM, and Cosine** to dive into structured data extraction methods for high-quality content retrieval. + +--- + +This outline equips users with tools for managing crawler identity and human-like behavior, essential for accessing bot-protected or restricted websites. \ No newline at end of file diff --git a/docs/md_v2/tutorial/episode_11_1_Extraction_Strategies_JSON_CSS.md b/docs/md_v2/tutorial/episode_11_1_Extraction_Strategies_JSON_CSS.md new file mode 100644 index 0000000000000000000000000000000000000000..b460ff8c22acbc0391ed7a15c735190d542db585 --- /dev/null +++ b/docs/md_v2/tutorial/episode_11_1_Extraction_Strategies_JSON_CSS.md @@ -0,0 +1,186 @@ +Here’s a detailed outline for the **JSON-CSS Extraction Strategy** video, covering all key aspects and supported structures in Crawl4AI: + +--- + +### **10.1 JSON-CSS Extraction Strategy** + +#### **1. Introduction to JSON-CSS Extraction** + - JSON-CSS Extraction is used for pulling structured data from pages with repeated patterns, like product listings, article feeds, or directories. + - This strategy allows defining a schema with CSS selectors and data fields, making it easy to capture nested, list-based, or singular elements. + +#### **2. Basic Schema Structure** + - **Schema Fields**: The schema has two main components: + - `baseSelector`: A CSS selector to locate the main elements you want to extract (e.g., each article or product block). 
+ - `fields`: Defines the data fields for each element, supporting various data types and structures. + +#### **3. Simple Field Extraction** + - **Example HTML**: + ```html +
    <div class="product"> + <h2 class="title">Sample Product</h2> + <span class="price">$19.99</span> + <p class="description">This is a sample product.</p> + </div>
    + ``` + - **Schema**: + ```python + schema = { + "baseSelector": ".product", + "fields": [ + {"name": "title", "selector": ".title", "type": "text"}, + {"name": "price", "selector": ".price", "type": "text"}, + {"name": "description", "selector": ".description", "type": "text"} + ] + } + ``` + - **Explanation**: Each field captures text content from specified CSS selectors within each `.product` element. + +#### **4. Supported Field Types: Text, Attribute, HTML, Regex** + - **Field Type Options**: + - `text`: Extracts visible text. + - `attribute`: Captures an HTML attribute (e.g., `src`, `href`). + - `html`: Extracts the raw HTML of an element. + - `regex`: Allows regex patterns to extract part of the text. + + - **Example HTML** (including an image): + ```html +
    <div class="product"> + <h2 class="title">Sample Product</h2> + <img class="product-image" src="product.jpg" alt="Product Image"> + <span class="price">$19.99</span> + <p class="description">Limited time offer.</p> + </div>
    + ``` + - **Schema**: + ```python + schema = { + "baseSelector": ".product", + "fields": [ + {"name": "title", "selector": ".title", "type": "text"}, + {"name": "image_url", "selector": ".product-image", "type": "attribute", "attribute": "src"}, + {"name": "price", "selector": ".price", "type": "regex", "pattern": r"\$(\d+\.\d+)"}, + {"name": "description_html", "selector": ".description", "type": "html"} + ] + } + ``` + - **Explanation**: + - `attribute`: Extracts the `src` attribute from `.product-image`. + - `regex`: Extracts the numeric part from `$19.99`. + - `html`: Retrieves the full HTML of the description element. + +#### **5. Nested Field Extraction** + - **Use Case**: Useful when content contains sub-elements, such as an article with author details within it. + - **Example HTML**: + ```html +
    <div class="article"> + <h2 class="title">Sample Article</h2> + <div class="author"> + <span class="name">John Doe</span> + <p class="bio">Writer and editor</p> + </div> + </div>
    + ``` + - **Schema**: + ```python + schema = { + "baseSelector": ".article", + "fields": [ + {"name": "title", "selector": ".title", "type": "text"}, + {"name": "author", "type": "nested", "selector": ".author", "fields": [ + {"name": "name", "selector": ".name", "type": "text"}, + {"name": "bio", "selector": ".bio", "type": "text"} + ]} + ] + } + ``` + - **Explanation**: + - `nested`: Extracts `name` and `bio` within `.author`, grouping the author details in a single `author` object. + +#### **6. List and Nested List Extraction** + - **List**: Extracts multiple elements matching the selector as a list. + - **Nested List**: Allows lists within lists, useful for items with sub-lists (e.g., specifications for each product). + - **Example HTML**: + ```html +
    <div class="product"> + <h2 class="title">Product with Features</h2> + <ul class="features"> + <li class="feature">Feature 1</li> + <li class="feature">Feature 2</li> + <li class="feature">Feature 3</li> + </ul> + </div>
    + ``` + - **Schema**: + ```python + schema = { + "baseSelector": ".product", + "fields": [ + {"name": "title", "selector": ".title", "type": "text"}, + {"name": "features", "type": "list", "selector": ".features .feature", "fields": [ + {"name": "feature", "type": "text"} + ]} + ] + } + ``` + - **Explanation**: + - `list`: Captures each `.feature` item within `.features`, outputting an array of features under the `features` field. + +#### **7. Transformations for Field Values** + - Transformations allow you to modify extracted values (e.g., converting to lowercase). + - Supported transformations: `lowercase`, `uppercase`, `strip`. + - **Example HTML**: + ```html +
    <div class="product"> + <h2 class="title">Special Product</h2> + </div>
    + ``` + - **Schema**: + ```python + schema = { + "baseSelector": ".product", + "fields": [ + {"name": "title", "selector": ".title", "type": "text", "transform": "uppercase"} + ] + } + ``` + - **Explanation**: The `transform` property changes the `title` to uppercase, useful for standardized outputs. + +#### **8. Full JSON-CSS Extraction Example** + - Combining all elements in a single schema example for a comprehensive crawl: + - **Example HTML**: + ```html +
    <div class="product"> + <h2 class="title">Featured Product</h2> + <img class="product-image" src="featured-product.jpg"> + <span class="price">$99.99</span> + <p class="description">Best product of the year.</p> + <ul class="features"> + <li class="feature">Durable</li> + <li class="feature">Eco-friendly</li> + </ul> + </div>
    + ``` + - **Schema**: + ```python + schema = { + "baseSelector": ".product", + "fields": [ + {"name": "title", "selector": ".title", "type": "text", "transform": "uppercase"}, + {"name": "image_url", "selector": ".product-image", "type": "attribute", "attribute": "src"}, + {"name": "price", "selector": ".price", "type": "regex", "pattern": r"\$(\d+\.\d+)"}, + {"name": "description", "selector": ".description", "type": "html"}, + {"name": "features", "type": "list", "selector": ".features .feature", "fields": [ + {"name": "feature", "type": "text"} + ]} + ] + } + ``` + - **Explanation**: This schema captures and transforms each aspect of the product, illustrating the JSON-CSS strategy’s versatility for structured extraction. + +#### **9. Wrap Up & Next Steps** + - Summarize JSON-CSS Extraction’s flexibility for structured, pattern-based extraction. + - Tease the next video: **10.2 LLM Extraction Strategy**, focusing on using language models to extract data based on intelligent content analysis. + +--- + +This outline covers each JSON-CSS Extraction option in Crawl4AI, with practical examples and schema configurations, making it a thorough guide for users. diff --git a/docs/md_v2/tutorial/episode_11_2_Extraction_Strategies_LLM.md b/docs/md_v2/tutorial/episode_11_2_Extraction_Strategies_LLM.md new file mode 100644 index 0000000000000000000000000000000000000000..a9f00e92115ae3672b938032eb54c61515d361b5 --- /dev/null +++ b/docs/md_v2/tutorial/episode_11_2_Extraction_Strategies_LLM.md @@ -0,0 +1,153 @@ +# Crawl4AI + +## Episode 11: Extraction Strategies: JSON CSS, LLM, and Cosine + +### Quick Intro +Introduce JSON CSS Extraction Strategy for structured data, LLM Extraction Strategy for intelligent parsing, and Cosine Strategy for clustering similar content. Demo: Use JSON CSS to scrape product details from an e-commerce site. + +Here’s a comprehensive outline for the **LLM Extraction Strategy** video, covering key details and example applications. + +--- + +### **10.2 LLM Extraction Strategy** + +#### **1. Introduction to LLM Extraction Strategy** + - The LLM Extraction Strategy leverages language models to interpret and extract structured data from complex web content. + - Unlike traditional CSS selectors, this strategy uses natural language instructions and schemas to guide the extraction, ideal for unstructured or diverse content. + - Supports **OpenAI**, **Azure OpenAI**, **HuggingFace**, and **Ollama** models, enabling flexibility with both proprietary and open-source providers. + +#### **2. Key Components of LLM Extraction Strategy** + - **Provider**: Specifies the LLM provider (e.g., OpenAI, HuggingFace, Azure). + - **API Token**: Required for most providers, except Ollama (local LLM model). + - **Instruction**: Custom extraction instructions sent to the model, providing flexibility in how the data is structured and extracted. + - **Schema**: Optional, defines structured fields to organize extracted data into JSON format. + - **Extraction Type**: Supports `"block"` for simpler text blocks or `"schema"` when a structured output format is required. + - **Chunking Parameters**: Breaks down large documents, with options to adjust chunk size and overlap rate for more accurate extraction across lengthy texts. + +#### **3. Basic Extraction Example: OpenAI Model Pricing** + - **Goal**: Extract model names and their input and output fees from the OpenAI pricing page. + - **Schema Definition**: + - **Model Name**: Text for model identification. 
+ - **Input Fee**: Token cost for input processing. + - **Output Fee**: Token cost for output generation. + + - **Schema**: + ```python + class OpenAIModelFee(BaseModel): + model_name: str = Field(..., description="Name of the OpenAI model.") + input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") + output_fee: str = Field(..., description="Fee for output token for the OpenAI model.") + ``` + + - **Example Code**: + ```python + async def extract_openai_pricing(): + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://openai.com/api/pricing/", + extraction_strategy=LLMExtractionStrategy( + provider="openai/gpt-4o", + api_token=os.getenv("OPENAI_API_KEY"), + schema=OpenAIModelFee.schema(), + extraction_type="schema", + instruction="Extract model names and fees for input and output tokens from the page." + ), + cache_mode=CacheMode.BYPASS + ) + print(result.extracted_content) + ``` + + - **Explanation**: + - The extraction strategy combines a schema and detailed instruction to guide the LLM in capturing structured data. + - Each model’s name, input fee, and output fee are extracted in a JSON format. + +#### **4. Knowledge Graph Extraction Example** + - **Goal**: Extract entities and their relationships from a document for use in a knowledge graph. + - **Schema Definition**: + - **Entities**: Individual items with descriptions (e.g., people, organizations). + - **Relationships**: Connections between entities, including descriptions and relationship types. + + - **Schema**: + ```python + class Entity(BaseModel): + name: str + description: str + + class Relationship(BaseModel): + entity1: Entity + entity2: Entity + description: str + relation_type: str + + class KnowledgeGraph(BaseModel): + entities: List[Entity] + relationships: List[Relationship] + ``` + + - **Example Code**: + ```python + async def extract_knowledge_graph(): + extraction_strategy = LLMExtractionStrategy( + provider="azure/gpt-4o-mini", + api_token=os.getenv("AZURE_API_KEY"), + schema=KnowledgeGraph.schema(), + extraction_type="schema", + instruction="Extract entities and relationships from the content to build a knowledge graph." + ) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/some-article", + extraction_strategy=extraction_strategy, + cache_mode=CacheMode.BYPASS + ) + print(result.extracted_content) + ``` + + - **Explanation**: + - In this setup, the LLM extracts entities and their relationships based on the schema and instruction. + - The schema organizes results into a JSON-based knowledge graph format. + +#### **5. Key Settings in LLM Extraction** + - **Chunking Options**: + - For long pages, set `chunk_token_threshold` to specify maximum token count per section. + - Adjust `overlap_rate` to control the overlap between chunks, useful for contextual consistency. + - **Example**: + ```python + extraction_strategy = LLMExtractionStrategy( + provider="openai/gpt-4", + api_token=os.getenv("OPENAI_API_KEY"), + chunk_token_threshold=3000, + overlap_rate=0.2, # 20% overlap between chunks + instruction="Extract key insights and relationships." + ) + ``` + - This setup ensures that longer texts are divided into manageable chunks with slight overlap, enhancing the quality of extraction. + +#### **6. Flexible Provider Options for LLM Extraction** + - **Using Proprietary Models**: OpenAI, Azure, and HuggingFace provide robust language models, often suited for complex or detailed extractions. 
+ - **Using Open-Source Models**: Ollama and other open-source models can be deployed locally, suitable for offline or cost-effective extraction. + - **Example Call**: + ```python + await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY")) + await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) + await extract_structured_data_using_llm("ollama/llama3.2") + ``` + +#### **7. Complete Example of LLM Extraction Setup** + - Code to run both the OpenAI pricing and Knowledge Graph extractions, using various providers: + ```python + async def main(): + await extract_openai_pricing() + await extract_knowledge_graph() + + if __name__ == "__main__": + asyncio.run(main()) + ``` + +#### **8. Wrap Up & Next Steps** + - Recap the power of LLM extraction for handling unstructured or complex data extraction tasks. + - Tease the next video: **10.3 Cosine Similarity Strategy** for clustering similar content based on semantic similarity. + +--- + +This outline explains LLM Extraction in Crawl4AI, with examples showing how to extract structured data using custom schemas and instructions. It demonstrates flexibility with multiple providers, ensuring practical application for different use cases. diff --git a/docs/md_v2/tutorial/episode_11_3_Extraction_Strategies_Cosine.md b/docs/md_v2/tutorial/episode_11_3_Extraction_Strategies_Cosine.md new file mode 100644 index 0000000000000000000000000000000000000000..6100ae4ca7a29e0ec5ba3119ba7d78b20fcb5e94 --- /dev/null +++ b/docs/md_v2/tutorial/episode_11_3_Extraction_Strategies_Cosine.md @@ -0,0 +1,136 @@ +# Crawl4AI + +## Episode 11: Extraction Strategies: JSON CSS, LLM, and Cosine + +### Quick Intro +Introduce JSON CSS Extraction Strategy for structured data, LLM Extraction Strategy for intelligent parsing, and Cosine Strategy for clustering similar content. Demo: Use JSON CSS to scrape product details from an e-commerce site. + +Here’s a structured outline for the **Cosine Similarity Strategy** video, covering key concepts, configuration, and a practical example. + +--- + +### **10.3 Cosine Similarity Strategy** + +#### **1. Introduction to Cosine Similarity Strategy** + - The Cosine Similarity Strategy clusters content by semantic similarity, offering an efficient alternative to LLM-based extraction, especially when speed is a priority. + - Ideal for grouping similar sections of text, this strategy is well-suited for pages with content sections that may need to be classified or tagged, like news articles, product descriptions, or reviews. + +#### **2. Key Configuration Options** + - **semantic_filter**: A keyword-based filter to focus on relevant content. + - **word_count_threshold**: Minimum number of words per cluster, filtering out shorter, less meaningful clusters. + - **max_dist**: Maximum allowable distance between elements in clusters, impacting cluster tightness. + - **linkage_method**: Method for hierarchical clustering, such as `'ward'` (for well-separated clusters). + - **top_k**: Specifies the number of top categories for each cluster. + - **model_name**: Defines the model for embeddings, such as `sentence-transformers/all-MiniLM-L6-v2`. + - **sim_threshold**: Minimum similarity threshold for filtering, allowing control over cluster relevance. + +#### **3. How Cosine Similarity Clustering Works** + - **Step 1**: Embeddings are generated for each text section, transforming them into vectors that capture semantic meaning. 
+ - **Step 2**: Hierarchical clustering groups similar sections based on cosine similarity, forming clusters with related content. + - **Step 3**: Clusters are filtered based on word count, removing those below the `word_count_threshold`. + - **Step 4**: Each cluster is then categorized with tags, if enabled, providing context to each grouped content section. + +#### **4. Example Use Case: Clustering Blog Article Sections** + - **Goal**: Group related sections of a blog or news page to identify distinct topics or discussion areas. + - **Example HTML Sections**: + ```text + "The economy is showing signs of recovery, with markets up this quarter.", + "In the sports world, several major teams are preparing for the upcoming season.", + "New advancements in AI technology are reshaping the tech landscape.", + "Market analysts are optimistic about continued growth in tech stocks." + ``` + + - **Code Setup**: + ```python + async def extract_blog_sections(): + extraction_strategy = CosineStrategy( + word_count_threshold=15, + max_dist=0.3, + sim_threshold=0.2, + model_name="sentence-transformers/all-MiniLM-L6-v2", + top_k=2 + ) + async with AsyncWebCrawler() as crawler: + url = "https://example.com/blog-page" + result = await crawler.arun( + url=url, + extraction_strategy=extraction_strategy, + cache_mode=CacheMode.BYPASS + ) + print(result.extracted_content) + ``` + + - **Explanation**: + - **word_count_threshold**: Ensures only clusters with meaningful content are included. + - **sim_threshold**: Filters out clusters with low similarity, focusing on closely related sections. + - **top_k**: Selects top tags, useful for identifying main topics. + +#### **5. Applying Semantic Filtering with Cosine Similarity** + - **Semantic Filter**: Filters sections based on relevance to a specific keyword, such as “technology” for tech articles. + - **Example Code**: + ```python + extraction_strategy = CosineStrategy( + semantic_filter="technology", + word_count_threshold=10, + max_dist=0.25, + model_name="sentence-transformers/all-MiniLM-L6-v2" + ) + ``` + - **Explanation**: + - **semantic_filter**: Only sections with high similarity to the “technology” keyword will be included in the clustering, making it easy to focus on specific topics within a mixed-content page. + +#### **6. Clustering Product Reviews by Similarity** + - **Goal**: Organize product reviews by themes, such as “price,” “quality,” or “durability.” + - **Example Reviews**: + ```text + "The quality of this product is outstanding and well worth the price.", + "I found the product to be durable but a bit overpriced.", + "Great value for the money and long-lasting.", + "The build quality is good, but I expected a lower price point." + ``` + + - **Code Setup**: + ```python + async def extract_product_reviews(): + extraction_strategy = CosineStrategy( + word_count_threshold=20, + max_dist=0.35, + sim_threshold=0.25, + model_name="sentence-transformers/all-MiniLM-L6-v2" + ) + async with AsyncWebCrawler() as crawler: + url = "https://example.com/product-reviews" + result = await crawler.arun( + url=url, + extraction_strategy=extraction_strategy, + cache_mode=CacheMode.BYPASS + ) + print(result.extracted_content) + ``` + + - **Explanation**: + - This configuration clusters similar reviews, grouping feedback by common themes, helping businesses understand customer sentiments around particular product aspects. + +#### **7. 
Performance Advantages of Cosine Strategy** + - **Speed**: The Cosine Similarity Strategy is faster than LLM-based extraction, as it doesn’t rely on API calls to external LLMs. + - **Local Processing**: The strategy runs locally with pre-trained sentence embeddings, ideal for high-throughput scenarios where cost and latency are concerns. + - **Comparison**: With a well-optimized local model, this method can perform clustering on large datasets quickly, making it suitable for tasks requiring rapid, repeated analysis. + +#### **8. Full Code Example for Clustering News Articles** + - **Code**: + ```python + async def main(): + await extract_blog_sections() + await extract_product_reviews() + + if __name__ == "__main__": + asyncio.run(main()) + ``` + +#### **9. Wrap Up & Next Steps** + - Recap the efficiency and effectiveness of Cosine Similarity for clustering related content quickly. + - Close with a reminder of Crawl4AI’s flexibility across extraction strategies, and prompt users to experiment with different settings to optimize clustering for their specific content. + +--- + +This outline covers Cosine Similarity Strategy’s speed and effectiveness, providing examples that showcase its potential for clustering various content types efficiently. diff --git a/docs/md_v2/tutorial/episode_12_Session-Based_Crawling_for_Dynamic_Websites.md b/docs/md_v2/tutorial/episode_12_Session-Based_Crawling_for_Dynamic_Websites.md new file mode 100644 index 0000000000000000000000000000000000000000..d1ab813d0e49598f164fb43072df20cd992dc86d --- /dev/null +++ b/docs/md_v2/tutorial/episode_12_Session-Based_Crawling_for_Dynamic_Websites.md @@ -0,0 +1,140 @@ +# Crawl4AI + +## Episode 12: Session-Based Crawling for Dynamic Websites + +### Quick Intro +Show session management for handling websites with multiple pages or actions (like “load more” buttons). Demo: Crawl a paginated content page, persisting session data across multiple requests. + +Here’s a detailed outline for the **Session-Based Crawling for Dynamic Websites** video, explaining why sessions are necessary, how to use them, and providing practical examples and a visual diagram to illustrate the concept. + +--- + +### **11. Session-Based Crawling for Dynamic Websites** + +#### **1. Introduction to Session-Based Crawling** + - **What is Session-Based Crawling**: Session-based crawling maintains a continuous browsing session across multiple page states, allowing the crawler to interact with a page and retrieve content that loads dynamically or based on user interactions. + - **Why It’s Needed**: + - In static pages, all content is available directly from a single URL. + - In dynamic websites, content often loads progressively or based on user actions (e.g., clicking “load more,” submitting forms, scrolling). + - Session-based crawling helps simulate user actions, capturing content that is otherwise hidden until specific actions are taken. + +#### **2. Conceptual Diagram for Session-Based Crawling** + + ```mermaid + graph TD + Start[Start Session] --> S1["Initial State (S1)"] + S1 -->|Crawl| Content1[Extract Content S1] + S1 -->|Action: Click Load More| S2[State S2] + S2 -->|Crawl| Content2[Extract Content S2] + S2 -->|Action: Scroll Down| S3[State S3] + S3 -->|Crawl| Content3[Extract Content S3] + S3 -->|Action: Submit Form| S4[Final State] + S4 -->|Crawl| Content4[Extract Content S4] + Content4 --> End[End Session] + ``` + + - **Explanation of Diagram**: + - **Start**: Initializes the session and opens the starting URL.
+ - **State Transitions**: Each action (e.g., clicking “load more,” scrolling) transitions to a new state, where additional content becomes available. + - **Session Persistence**: Keeps the same browsing session active, preserving the state and allowing for a sequence of actions to unfold. + - **End**: After reaching the final state, the session ends, and all accumulated content has been extracted. + +#### **3. Key Components of Session-Based Crawling in Crawl4AI** + - **Session ID**: A unique identifier to maintain the state across requests, allowing the crawler to “remember” previous actions. + - **JavaScript Execution**: Executes JavaScript commands (e.g., clicks, scrolls) to simulate interactions. + - **Wait Conditions**: Ensures the crawler waits for content to load in each state before moving on. + - **Sequential State Transitions**: By defining actions and wait conditions between states, the crawler can navigate through the page as a user would. + +#### **4. Basic Session Example: Multi-Step Content Loading** + - **Goal**: Crawl an article feed that requires several “load more” clicks to display additional content. + - **Code**: + ```python + async def crawl_article_feed(): + async with AsyncWebCrawler() as crawler: + session_id = "feed_session" + + for page in range(3): + result = await crawler.arun( + url="https://example.com/articles", + session_id=session_id, + js_code="document.querySelector('.load-more-button').click();" if page > 0 else None, + wait_for="css:.article", + css_selector=".article" # Target article elements + ) + print(f"Page {page + 1}: Extracted {len(result.extracted_content)} articles") + ``` + - **Explanation**: + - **session_id**: Ensures all requests share the same browsing state. + - **js_code**: Clicks the “load more” button after the initial page load, expanding content on each iteration. + - **wait_for**: Ensures articles have loaded after each click before extraction. + +#### **5. Advanced Example: E-Commerce Product Search with Filter Selection** + - **Goal**: Interact with filters on an e-commerce page to extract products based on selected criteria. + - **Example Steps**: + 1. **State 1**: Load the main product page. + 2. **State 2**: Apply a filter (e.g., “On Sale”) by selecting a checkbox. + 3. **State 3**: Scroll to load additional products and capture updated results. + + - **Code**: + ```python + async def extract_filtered_products(): + async with AsyncWebCrawler() as crawler: + session_id = "product_session" + + # Step 1: Open product page + result = await crawler.arun( + url="https://example.com/products", + session_id=session_id, + wait_for="css:.product-item" + ) + + # Step 2: Apply filter (e.g., "On Sale") + result = await crawler.arun( + url="https://example.com/products", + session_id=session_id, + js_code="document.querySelector('#sale-filter-checkbox').click();", + wait_for="css:.product-item" + ) + + # Step 3: Scroll to load additional products + for _ in range(2): # Scroll down twice + result = await crawler.arun( + url="https://example.com/products", + session_id=session_id, + js_code="window.scrollTo(0, document.body.scrollHeight);", + wait_for="css:.product-item" + ) + print(f"Loaded {len(result.extracted_content)} products after scroll") + ``` + - **Explanation**: + - **State Persistence**: Each action (filter selection and scroll) builds on the previous session state. + - **Multiple Interactions**: Combines clicking a filter with scrolling, demonstrating how the session preserves these actions. + +#### **6. 
Key Benefits of Session-Based Crawling** + - **Accessing Hidden Content**: Retrieves data that loads only after user actions. + - **Simulating User Behavior**: Handles interactive elements such as “load more” buttons, dropdowns, and filters. + - **Maintaining Continuity Across States**: Enables a sequential process, moving logically from one state to the next, capturing all desired content without reloading the initial state each time. + +#### **7. Additional Configuration Tips** + - **Manage Session End**: Always conclude the session after the final state to release resources. + - **Optimize with Wait Conditions**: Use `wait_for` to ensure complete loading before each extraction. + - **Handling Errors in Session-Based Crawling**: Include error handling for interactions that may fail, ensuring robustness across state transitions. + +#### **8. Complete Code Example: Multi-Step Session Workflow** + - **Example**: + ```python + async def main(): + await crawl_article_feed() + await extract_filtered_products() + + if __name__ == "__main__": + asyncio.run(main()) + ``` + +#### **9. Wrap Up & Next Steps** + - Recap the usefulness of session-based crawling for dynamic content extraction. + - Tease the next video: **Hooks and Custom Workflow with AsyncWebCrawler** to cover advanced customization options for further control over the crawling process. + +--- + +This outline covers session-based crawling from both a conceptual and practical perspective, helping users understand its importance, configure it effectively, and use it to handle complex dynamic content. \ No newline at end of file diff --git a/docs/md_v2/tutorial/episode_13_Chunking_Strategies_for_Large_Text_Processing.md b/docs/md_v2/tutorial/episode_13_Chunking_Strategies_for_Large_Text_Processing.md new file mode 100644 index 0000000000000000000000000000000000000000..eda07e8ba021b3685029801e12d88f58e28f0ef9 --- /dev/null +++ b/docs/md_v2/tutorial/episode_13_Chunking_Strategies_for_Large_Text_Processing.md @@ -0,0 +1,138 @@ +# Crawl4AI + +## Episode 13: Chunking Strategies for Large Text Processing + +### Quick Intro +Explain Regex, NLP, and Fixed-Length chunking, and when to use each. Demo: Chunk a large article or document for processing by topics or sentences. + +Here’s a structured outline for the **Chunking Strategies for Large Text Processing** video, emphasizing how chunking works within extraction and why it’s crucial for effective data aggregation. + +Here’s a structured outline for the **Chunking Strategies for Large Text Processing** video, explaining each strategy, when to use it, and providing examples to illustrate. + +--- + +### **12. Chunking Strategies for Large Text Processing** + +#### **1. Introduction to Chunking in Crawl4AI** + - **What is Chunking**: Chunking is the process of dividing large text into manageable sections or “chunks,” enabling efficient processing in extraction tasks. + - **Why It’s Needed**: + - When processing large text, feeding it directly into an extraction function (like `F(x)`) can overwhelm memory or token limits. + - Chunking breaks down `x` (the text) into smaller pieces, which are processed sequentially or in parallel by the extraction function, with the final result being an aggregation of all chunks’ processed output. + +#### **2. Key Chunking Strategies and Use Cases** + - Crawl4AI offers various chunking strategies to suit different text structures, chunk sizes, and processing requirements. 
+ - **Choosing a Strategy**: Select based on the type of text (e.g., articles, transcripts) and extraction needs (e.g., simple splitting or context-sensitive processing). + +#### **3. Strategy 1: Regex-Based Chunking** + - **Description**: Uses regular expressions to split text based on specified patterns (e.g., paragraphs or section breaks). + - **Use Case**: Ideal for dividing text by paragraphs or larger logical blocks where sections are clearly separated by line breaks or punctuation. + - **Example**: + - **Pattern**: `r'\n\n'` for double line breaks. + ```python + chunker = RegexChunking(patterns=[r'\n\n']) + text_chunks = chunker.chunk(long_text) + print(text_chunks) # Output: List of paragraphs + ``` + - **Pros**: Flexible for pattern-based chunking. + - **Cons**: Limited to text with consistent formatting. + +#### **4. Strategy 2: NLP Sentence-Based Chunking** + - **Description**: Uses NLP to split text by sentences, ensuring grammatically complete segments. + - **Use Case**: Useful for extracting individual statements, such as in news articles, quotes, or legal text. + - **Example**: + ```python + chunker = NlpSentenceChunking() + sentence_chunks = chunker.chunk(long_text) + print(sentence_chunks) # Output: List of sentences + ``` + - **Pros**: Maintains sentence structure, ideal for tasks needing semantic completeness. + - **Cons**: May create very small chunks, which could limit contextual extraction. + +#### **5. Strategy 3: Topic-Based Segmentation Using TextTiling** + - **Description**: Segments text into topics using TextTiling, identifying topic shifts and key segments. + - **Use Case**: Ideal for long articles, reports, or essays where each section covers a different topic. + - **Example**: + ```python + chunker = TopicSegmentationChunking(num_keywords=3) + topic_chunks = chunker.chunk_with_topics(long_text) + print(topic_chunks) # Output: List of topic segments with keywords + ``` + - **Pros**: Groups related content, preserving topical coherence. + - **Cons**: Depends on identifiable topic shifts, which may not be present in all texts. + +#### **6. Strategy 4: Fixed-Length Word Chunking** + - **Description**: Splits text into chunks based on a fixed number of words. + - **Use Case**: Ideal for text where exact segment size is required, such as processing word-limited documents for LLMs. + - **Example**: + ```python + chunker = FixedLengthWordChunking(chunk_size=100) + word_chunks = chunker.chunk(long_text) + print(word_chunks) # Output: List of 100-word chunks + ``` + - **Pros**: Ensures uniform chunk sizes, suitable for token-based extraction limits. + - **Cons**: May split sentences, affecting semantic coherence. + +#### **7. Strategy 5: Sliding Window Chunking** + - **Description**: Uses a fixed window size with a step, creating overlapping chunks to maintain context. + - **Use Case**: Useful for maintaining context across sections, as with documents where context is needed for neighboring sections. + - **Example**: + ```python + chunker = SlidingWindowChunking(window_size=100, step=50) + window_chunks = chunker.chunk(long_text) + print(window_chunks) # Output: List of overlapping word chunks + ``` + - **Pros**: Retains context across adjacent chunks, ideal for complex semantic extraction. + - **Cons**: Overlap increases data size, potentially impacting processing time. + +#### **8. Strategy 6: Overlapping Window Chunking** + - **Description**: Similar to sliding windows but with a defined overlap, allowing chunks to share content at the edges. 
+ - **Use Case**: Suitable for handling long texts with essential overlapping information, like research articles or medical records. + - **Example**: + ```python + chunker = OverlappingWindowChunking(window_size=1000, overlap=100) + overlap_chunks = chunker.chunk(long_text) + print(overlap_chunks) # Output: List of overlapping chunks with defined overlap + ``` + - **Pros**: Allows controlled overlap for consistent content coverage across chunks. + - **Cons**: Redundant data in overlapping areas may increase computation. + +#### **9. Practical Example: Using Chunking with an Extraction Strategy** + - **Goal**: Combine chunking with an extraction strategy to process large text effectively. + - **Example Code**: + ```python + from crawl4ai.extraction_strategy import LLMExtractionStrategy + + async def extract_large_text(): + # Initialize chunker and extraction strategy + chunker = FixedLengthWordChunking(chunk_size=200) + extraction_strategy = LLMExtractionStrategy(provider="openai/gpt-4", api_token="your_api_token") + + # Split text into chunks + text_chunks = chunker.chunk(large_text) + + async with AsyncWebCrawler() as crawler: + for chunk in text_chunks: + result = await crawler.arun( + url="https://example.com", + extraction_strategy=extraction_strategy, + content=chunk + ) + print(result.extracted_content) + ``` + + - **Explanation**: + - `chunker.chunk()`: Divides the `large_text` into smaller segments based on the chosen strategy. + - `extraction_strategy`: Processes each chunk separately, and results are then aggregated to form the final output. + +#### **10. Choosing the Right Chunking Strategy** + - **Text Structure**: If text has clear sections (e.g., paragraphs, topics), use Regex or Topic Segmentation. + - **Extraction Needs**: If context is crucial, consider Sliding or Overlapping Window Chunking. + - **Processing Constraints**: For word-limited extractions (e.g., LLMs with token limits), Fixed-Length Word Chunking is often most effective. + +#### **11. Wrap Up & Next Steps** + - Recap the benefits of each chunking strategy and when to use them in extraction workflows. + - Tease the next video: **Hooks and Custom Workflow with AsyncWebCrawler**, focusing on customizing crawler behavior with hooks for a fine-tuned extraction process. + +--- + +This outline provides a complete understanding of chunking strategies, explaining each method’s strengths and best-use scenarios to help users process large texts effectively in Crawl4AI. \ No newline at end of file diff --git a/docs/md_v2/tutorial/episode_14_Hooks_and_Custom_Workflow_with_AsyncWebCrawler.md b/docs/md_v2/tutorial/episode_14_Hooks_and_Custom_Workflow_with_AsyncWebCrawler.md new file mode 100644 index 0000000000000000000000000000000000000000..87a3d217c8031fddd162d1417843e11305a45d47 --- /dev/null +++ b/docs/md_v2/tutorial/episode_14_Hooks_and_Custom_Workflow_with_AsyncWebCrawler.md @@ -0,0 +1,185 @@ +# Crawl4AI + +## Episode 14: Hooks and Custom Workflow with AsyncWebCrawler + +### Quick Intro +Cover hooks (`on_browser_created`, `before_goto`, `after_goto`) to add custom workflows. Demo: Use hooks to add custom cookies or headers, log HTML, or trigger specific events on page load. + +Here’s a detailed outline for the **Hooks and Custom Workflow with AsyncWebCrawler** video, covering each hook’s purpose, usage, and example implementations. + +--- + +### **13. Hooks and Custom Workflow with AsyncWebCrawler** + +#### **1. 
Introduction to Hooks in Crawl4AI** + - **What are Hooks**: Hooks are customizable entry points in the crawling process that allow users to inject custom actions or logic at specific stages. + - **Why Use Hooks**: + - They enable fine-grained control over the crawling workflow. + - Useful for performing additional tasks (e.g., logging, modifying headers) dynamically during the crawl. + - Hooks provide the flexibility to adapt the crawler to complex site structures or unique project needs. + +#### **2. Overview of Available Hooks** + - Crawl4AI offers seven key hooks to modify and control different stages in the crawling lifecycle: + - `on_browser_created` + - `on_user_agent_updated` + - `on_execution_started` + - `before_goto` + - `after_goto` + - `before_return_html` + - `before_retrieve_html` + +#### **3. Hook-by-Hook Explanation and Examples** + +--- + +##### **Hook 1: `on_browser_created`** + - **Purpose**: Triggered right after the browser instance is created. + - **Use Case**: + - Initializing browser-specific settings or performing setup actions. + - Configuring browser extensions or scripts before any page is opened. + - **Example**: + ```python + async def log_browser_creation(browser): + print("Browser instance created:", browser) + + crawler.crawler_strategy.set_hook('on_browser_created', log_browser_creation) + ``` + - **Explanation**: This hook logs the browser creation event, useful for tracking when a new browser instance starts. + +--- + +##### **Hook 2: `on_user_agent_updated`** + - **Purpose**: Called whenever the user agent string is updated. + - **Use Case**: + - Modifying the user agent based on page requirements, e.g., changing to a mobile user agent for mobile-only pages. + - **Example**: + ```python + def update_user_agent(user_agent): + print(f"User Agent Updated: {user_agent}") + + crawler.crawler_strategy.set_hook('on_user_agent_updated', update_user_agent) + crawler.update_user_agent("Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X)") + ``` + - **Explanation**: This hook provides a callback every time the user agent changes, helpful for debugging or dynamically altering user agent settings based on conditions. + +--- + +##### **Hook 3: `on_execution_started`** + - **Purpose**: Called right before the crawler begins any interaction (e.g., JavaScript execution, clicks). + - **Use Case**: + - Performing setup actions, such as inserting cookies or initiating custom scripts. + - **Example**: + ```python + async def log_execution_start(page): + print("Execution started on page:", page.url) + + crawler.crawler_strategy.set_hook('on_execution_started', log_execution_start) + ``` + - **Explanation**: Logs the start of any major interaction on the page, ideal for cases where you want to monitor each interaction. + +--- + +##### **Hook 4: `before_goto`** + - **Purpose**: Triggered before navigating to a new URL with `page.goto()`. + - **Use Case**: + - Modifying request headers or setting up conditions right before the page loads. + - Adding headers or dynamically adjusting options for specific URLs. + - **Example**: + ```python + async def modify_headers_before_goto(page): + await page.set_extra_http_headers({"X-Custom-Header": "CustomValue"}) + print("Custom headers set before navigation") + + crawler.crawler_strategy.set_hook('before_goto', modify_headers_before_goto) + ``` + - **Explanation**: This hook allows injecting headers or altering settings based on the page’s needs, particularly useful for pages with custom requirements. 
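+   - **Related sketch (cookies)**: The Quick Intro also mentions adding custom cookies. One minimal, hypothetical way to do that is through the `on_execution_started` hook shown above, assuming the hook receives a Playwright `Page` object as in the other examples; the cookie name, value, and URL below are placeholders.
+     ```python
+     async def add_session_cookie(page):
+         # Placeholder cookie; replace name, value, and URL with your own site's details.
+         await page.context.add_cookies([{
+             "name": "session_id",
+             "value": "YOUR_SESSION_TOKEN",
+             "url": "https://example.com",
+         }])
+         print("Session cookie injected before execution")
+
+     crawler.crawler_strategy.set_hook('on_execution_started', add_session_cookie)
+     ```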
+ +--- + +##### **Hook 5: `after_goto`** + - **Purpose**: Executed immediately after a page has loaded (after `page.goto()`). + - **Use Case**: + - Checking the loaded page state, modifying the DOM, or performing post-navigation actions (e.g., scrolling). + - **Example**: + ```python + async def post_navigation_scroll(page): + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + print("Scrolled to the bottom after navigation") + + crawler.crawler_strategy.set_hook('after_goto', post_navigation_scroll) + ``` + - **Explanation**: This hook scrolls to the bottom of the page after loading, which can help load dynamically added content like infinite scroll elements. + +--- + +##### **Hook 6: `before_return_html`** + - **Purpose**: Called right before HTML content is retrieved and returned. + - **Use Case**: + - Removing overlays or cleaning up the page for a cleaner HTML extraction. + - **Example**: + ```python + async def remove_advertisements(page, html): + await page.evaluate("document.querySelectorAll('.ad-banner').forEach(el => el.remove());") + print("Advertisements removed before returning HTML") + + crawler.crawler_strategy.set_hook('before_return_html', remove_advertisements) + ``` + - **Explanation**: The hook removes ad banners from the HTML before it’s retrieved, ensuring a cleaner data extraction. + +--- + +##### **Hook 7: `before_retrieve_html`** + - **Purpose**: Runs right before Crawl4AI initiates HTML retrieval. + - **Use Case**: + - Finalizing any page adjustments (e.g., setting timers, waiting for specific elements). + - **Example**: + ```python + async def wait_for_content_before_retrieve(page): + await page.wait_for_selector('.main-content') + print("Main content loaded, ready to retrieve HTML") + + crawler.crawler_strategy.set_hook('before_retrieve_html', wait_for_content_before_retrieve) + ``` + - **Explanation**: This hook waits for the main content to load before retrieving the HTML, ensuring that all essential content is captured. + +#### **4. Setting Hooks in Crawl4AI** + - **How to Set Hooks**: + - Use `set_hook` to define a custom function for each hook. + - Each hook function can be asynchronous (useful for actions like waiting or retrieving async data). + - **Example Setup**: + ```python + crawler.crawler_strategy.set_hook('on_browser_created', log_browser_creation) + crawler.crawler_strategy.set_hook('before_goto', modify_headers_before_goto) + crawler.crawler_strategy.set_hook('after_goto', post_navigation_scroll) + ``` + +#### **5. Complete Example: Using Hooks for a Customized Crawl Workflow** + - **Goal**: Log each key step, set custom headers before navigation, and clean up the page before retrieving HTML. + - **Example Code**: + ```python + async def custom_crawl(): + async with AsyncWebCrawler() as crawler: + # Set hooks for custom workflow + crawler.crawler_strategy.set_hook('on_browser_created', log_browser_creation) + crawler.crawler_strategy.set_hook('before_goto', modify_headers_before_goto) + crawler.crawler_strategy.set_hook('after_goto', post_navigation_scroll) + crawler.crawler_strategy.set_hook('before_return_html', remove_advertisements) + + # Perform the crawl + url = "https://example.com" + result = await crawler.arun(url=url) + print(result.html) # Display or process HTML + ``` + +#### **6. Benefits of Using Hooks in Custom Crawling Workflows** + - **Enhanced Control**: Hooks offer precise control over each stage, allowing adjustments based on content and structure. 
+ - **Efficient Modifications**: Avoid reloading or restarting the session; hooks can alter actions dynamically. + - **Context-Sensitive Actions**: Hooks enable custom logic tailored to specific pages or sections, maximizing extraction quality. + +#### **7. Wrap Up & Next Steps** + - Recap how hooks empower customized workflows in Crawl4AI, enabling flexibility at every stage. + - Tease the next video: **Automating Post-Processing with Crawl4AI**, covering automated steps after data extraction. + +--- + +This outline provides a thorough understanding of hooks, their practical applications, and examples for customizing the crawling workflow in Crawl4AI. \ No newline at end of file diff --git a/docs/md_v2/tutorial/tutorial.md b/docs/md_v2/tutorial/tutorial.md new file mode 100644 index 0000000000000000000000000000000000000000..7bead8424708fb3b7a5a5143be79d40480302134 --- /dev/null +++ b/docs/md_v2/tutorial/tutorial.md @@ -0,0 +1,1789 @@ +# Crawl4AI + +## Episode 1: Introduction to Crawl4AI and Basic Installation + +### Quick Intro +Walk through installation from PyPI, setup, and verification. Show how to install with options like `torch` or `transformer` for advanced capabilities. + +Here's a condensed outline of the **Installation and Setup** video content: + +--- + +1) **Introduction to Crawl4AI**: + + - Briefly explain that Crawl4AI is a powerful tool for web scraping, data extraction, and content processing, with customizable options for various needs. + +2) **Installation Overview**: + + - **Basic Install**: Run `pip install crawl4ai` and `playwright install` (to set up browser dependencies). + - **Optional Advanced Installs**: + - `pip install crawl4ai[torch]` - Adds PyTorch for clustering. + - `pip install crawl4ai[transformer]` - Adds support for LLM-based extraction. + - `pip install crawl4ai[all]` - Installs all features for complete functionality. + +3) **Verifying the Installation**: + + - Walk through a simple test script to confirm the setup: + ```python + import asyncio + from crawl4ai import AsyncWebCrawler, CacheMode + + async def main(): + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun(url="https://www.example.com") + print(result.markdown[:500]) # Show first 500 characters + + asyncio.run(main()) + ``` + - Explain that this script initializes the crawler and runs it on a test URL, displaying part of the extracted content to verify functionality. + +4) **Important Tips**: + + - **Run** `playwright install` **after installation** to set up dependencies. + - **For full performance** on text-related tasks, run `crawl4ai-download-models` after installing with `[torch]`, `[transformer]`, or `[all]` options. + - If you encounter issues, refer to the documentation or GitHub issues. + +5) **Wrap Up**: + + - Introduce the next topic in the series, which will cover Crawl4AI's browser configuration options (like choosing between `chromium`, `firefox`, and `webkit`). + +--- + +This structure provides a concise, effective guide to get viewers up and running with Crawl4AI in minutes.# Crawl4AI + +## Episode 2: Overview of Advanced Features + +### Quick Intro +A general overview of advanced features like hooks, CSS selectors, and JSON CSS extraction. 
+ +Here's a condensed outline for an **Overview of Advanced Features** video covering Crawl4AI's powerful customization and extraction options: + +--- + +### **Overview of Advanced Features** + +1) **Introduction to Advanced Features**: + + - Briefly introduce Crawl4AI’s advanced tools, which let users go beyond basic crawling to customize and fine-tune their scraping workflows. + +2) **Taking Screenshots**: + + - Explain the screenshot capability for capturing page state and verifying content. + - **Example**: + ```python + result = await crawler.arun(url="https://www.example.com", screenshot=True) + ``` + - Mention that screenshots are saved as a base64 string in `result`, allowing easy decoding and saving. + +3) **Media and Link Extraction**: + + - Demonstrate how to pull all media (images, videos) and links (internal and external) from a page for deeper analysis or content gathering. + - **Example**: + ```python + result = await crawler.arun(url="https://www.example.com") + print("Media:", result.media) + print("Links:", result.links) + ``` + +4) **Custom User Agent**: + + - Show how to set a custom user agent to disguise the crawler or simulate specific devices/browsers. + - **Example**: + ```python + result = await crawler.arun(url="https://www.example.com", user_agent="Mozilla/5.0 (compatible; MyCrawler/1.0)") + ``` + +5) **Custom Hooks for Enhanced Control**: + + - Briefly cover how to use hooks, which allow custom actions like setting headers or handling login during the crawl. + - **Example**: Setting a custom header with `before_get_url` hook. + ```python + async def before_get_url(page): + await page.set_extra_http_headers({"X-Test-Header": "test"}) + ``` + +6) **CSS Selectors for Targeted Extraction**: + + - Explain the use of CSS selectors to extract specific elements, ideal for structured data like articles or product details. + - **Example**: + ```python + result = await crawler.arun(url="https://www.example.com", css_selector="h2") + print("H2 Tags:", result.extracted_content) + ``` + +7) **Crawling Inside Iframes**: + + - Mention how enabling `process_iframes=True` allows extracting content within iframes, useful for sites with embedded content or ads. + - **Example**: + ```python + result = await crawler.arun(url="https://www.example.com", process_iframes=True) + ``` + +8) **Wrap-Up**: + + - Summarize these advanced features and how they allow users to customize every part of their web scraping experience. + - Tease upcoming videos where each feature will be explored in detail. + +--- + +This covers each advanced feature with a brief example, providing a useful overview to prepare viewers for the more in-depth videos.# Crawl4AI + +## Episode 3: Browser Configurations & Headless Crawling + +### Quick Intro +Explain browser options (`chromium`, `firefox`, `webkit`) and settings for headless mode, caching, and verbose logging. + +Here’s a streamlined outline for the **Browser Configurations & Headless Crawling** video: + +--- + +### **Browser Configurations & Headless Crawling** + +1) **Overview of Browser Options**: + + - Crawl4AI supports three browser engines: + - **Chromium** (default) - Highly compatible. + - **Firefox** - Great for specialized use cases. + - **Webkit** - Lightweight, ideal for basic needs. 
+ - **Example**: + ```python + # Using Chromium (default) + crawler = AsyncWebCrawler(browser_type="chromium") + + # Using Firefox + crawler = AsyncWebCrawler(browser_type="firefox") + + # Using WebKit + crawler = AsyncWebCrawler(browser_type="webkit") + ``` + +2) **Headless Mode**: + + - Headless mode runs the browser without a visible GUI, making it faster and less resource-intensive. + - To enable or disable: + ```python + # Headless mode (default is True) + crawler = AsyncWebCrawler(headless=True) + + # Disable headless mode for debugging + crawler = AsyncWebCrawler(headless=False) + ``` + +3) **Verbose Logging**: + + - Use `verbose=True` to get detailed logs for each action, useful for debugging: + ```python + crawler = AsyncWebCrawler(verbose=True) + ``` + +4) **Running a Basic Crawl with Configuration**: + + - Example of a simple crawl with custom browser settings: + ```python + async with AsyncWebCrawler(browser_type="firefox", headless=True, verbose=True) as crawler: + result = await crawler.arun(url="https://www.example.com") + print(result.markdown[:500]) # Show first 500 characters + ``` + - This example uses Firefox in headless mode with logging enabled, demonstrating the flexibility of Crawl4AI’s setup. + +5) **Recap & Next Steps**: + + - Recap the power of selecting different browsers and running headless mode for speed and efficiency. + - Tease the next video: **Proxy & Security Settings** for navigating blocked or restricted content and protecting IP identity. + +--- + +This breakdown covers browser configuration essentials in Crawl4AI, providing users with practical steps to optimize their scraping setup.# Crawl4AI + +## Episode 4: Advanced Proxy and Security Settings + +### Quick Intro +Showcase proxy configurations (HTTP, SOCKS5, authenticated proxies). Demo: Use rotating proxies and set custom headers to avoid IP blocking and enhance security. + +Here’s a focused outline for the **Proxy and Security Settings** video: + +--- + +### **Proxy & Security Settings** + +1) **Why Use Proxies in Web Crawling**: + + - Proxies are essential for bypassing IP-based restrictions, improving anonymity, and managing rate limits. + - Crawl4AI supports simple proxies, authenticated proxies, and proxy rotation for robust web scraping. + +2) **Basic Proxy Setup**: + + - **Using a Simple Proxy**: + ```python + # HTTP proxy + crawler = AsyncWebCrawler(proxy="http://proxy.example.com:8080") + + # SOCKS proxy + crawler = AsyncWebCrawler(proxy="socks5://proxy.example.com:1080") + ``` + +3) **Authenticated Proxies**: + + - Use `proxy_config` for proxies requiring a username and password: + ```python + proxy_config = { + "server": "http://proxy.example.com:8080", + "username": "user", + "password": "pass" + } + crawler = AsyncWebCrawler(proxy_config=proxy_config) + ``` + +4) **Rotating Proxies**: + + - Rotating proxies helps avoid IP bans by switching IP addresses for each request: + ```python + async def get_next_proxy(): + # Define proxy rotation logic here + return {"server": "http://next.proxy.com:8080"} + + async with AsyncWebCrawler() as crawler: + for url in urls: + proxy = await get_next_proxy() + crawler.update_proxy(proxy) + result = await crawler.arun(url=url) + ``` + - This setup periodically switches the proxy for enhanced security and access. 
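+   - **Rotation sketch**: As one hypothetical way to fill in the `get_next_proxy()` stub above, the snippet below cycles through a fixed pool with `itertools.cycle`; the proxy URLs are placeholders, and a production rotator might also check proxy health or rotate per target domain.
+     ```python
+     from itertools import cycle
+
+     # Placeholder proxy pool; swap in your own providers.
+     PROXY_POOL = cycle([
+         {"server": "http://proxy-a.example.com:8080"},
+         {"server": "http://proxy-b.example.com:8080"},
+         {"server": "http://proxy-c.example.com:8080"},
+     ])
+
+     async def get_next_proxy():
+         # Simple round-robin selection over the pool.
+         return next(PROXY_POOL)
+     ```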
+ +5) **Custom Headers for Additional Security**: + + - Set custom headers to mask the crawler’s identity and avoid detection: + ```python + headers = { + "X-Forwarded-For": "203.0.113.195", + "Accept-Language": "en-US,en;q=0.9", + "Cache-Control": "no-cache", + "Pragma": "no-cache" + } + crawler = AsyncWebCrawler(headers=headers) + ``` + +6) **Combining Proxies with Magic Mode for Anti-Bot Protection**: + + - For sites with aggressive bot detection, combine `proxy` settings with `magic=True`: + ```python + async with AsyncWebCrawler(proxy="http://proxy.example.com:8080", headers={"Accept-Language": "en-US"}) as crawler: + result = await crawler.arun( + url="https://example.com", + magic=True # Enables anti-detection features + ) + ``` + - **Magic Mode** automatically enables user simulation, random timing, and browser property masking. + +7) **Wrap Up & Next Steps**: + + - Summarize the importance of proxies and anti-detection in accessing restricted content and avoiding bans. + - Tease the next video: **JavaScript Execution and Handling Dynamic Content** for working with interactive and dynamically loaded pages. + +--- + +This outline provides a practical guide to setting up proxies and security configurations, empowering users to navigate restricted sites while staying undetected.# Crawl4AI + +## Episode 5: JavaScript Execution and Dynamic Content Handling + +### Quick Intro +Explain JavaScript code injection with examples (e.g., simulating scrolling, clicking ‘load more’). Demo: Extract content from a page that uses dynamic loading with lazy-loaded images. + +Here’s a focused outline for the **JavaScript Execution and Dynamic Content Handling** video: + +--- + +### **JavaScript Execution & Dynamic Content Handling** + +1) **Why JavaScript Execution Matters**: + + - Many modern websites load content dynamically via JavaScript, requiring special handling to access all elements. + - Crawl4AI can execute JavaScript on pages, enabling it to interact with elements like “load more” buttons, infinite scrolls, and content that appears only after certain actions. + +2) **Basic JavaScript Execution**: + + - Use `js_code` to execute JavaScript commands on a page: + ```python + # Scroll to bottom of the page + result = await crawler.arun( + url="https://example.com", + js_code="window.scrollTo(0, document.body.scrollHeight);" + ) + ``` + - This command scrolls to the bottom, triggering any lazy-loaded or dynamically added content. + +3) **Multiple Commands & Simulating Clicks**: + + - Combine multiple JavaScript commands to interact with elements like “load more” buttons: + ```python + js_commands = [ + "window.scrollTo(0, document.body.scrollHeight);", + "document.querySelector('.load-more').click();" + ] + result = await crawler.arun( + url="https://example.com", + js_code=js_commands + ) + ``` + - This script scrolls down and then clicks the “load more” button, useful for loading additional content blocks. + +4) **Waiting for Dynamic Content**: + + - Use `wait_for` to ensure the page loads specific elements before proceeding: + ```python + result = await crawler.arun( + url="https://example.com", + js_code="window.scrollTo(0, document.body.scrollHeight);", + wait_for="css:.dynamic-content" # Wait for elements with class `.dynamic-content` + ) + ``` + - This example waits until elements with `.dynamic-content` are loaded, helping to capture content that appears after JavaScript actions. 
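+   - **Lazy-loaded images (sketch)**: The episode intro promises a lazy-loading demo; a minimal sketch is to trigger loading with a scroll and then wait for the image placeholders, reusing the same parameters that appear later in the media episode (`wait_for` with a CSS selector and `delay_before_return_html`). The URL, selector, and delay are assumptions to adapt to your target page.
+     ```python
+     result = await crawler.arun(
+         url="https://example.com/gallery",                           # placeholder URL
+         js_code="window.scrollTo(0, document.body.scrollHeight);",   # trigger lazy loading
+         wait_for="css:img[data-src]",                                # wait for lazy-image placeholders
+         delay_before_return_html=2.0                                 # small buffer so images finish loading
+     )
+     print(len(result.media["images"]), "images captured")
+     ```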
+ +5) **Handling Complex Dynamic Content (e.g., Infinite Scroll)**: + + - Combine JavaScript execution with conditional waiting to handle infinite scrolls or paginated content: + ```python + result = await crawler.arun( + url="https://example.com", + js_code=[ + "window.scrollTo(0, document.body.scrollHeight);", + "const loadMore = document.querySelector('.load-more'); if (loadMore) loadMore.click();" + ], + wait_for="js:() => document.querySelectorAll('.item').length > 10" # Wait until 10 items are loaded + ) + ``` + - This example scrolls and clicks "load more" repeatedly, waiting each time for a specified number of items to load. + +6) **Complete Example: Dynamic Content Handling with Extraction**: + + - Full example demonstrating a dynamic load and content extraction in one process: + ```python + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com", + js_code=[ + "window.scrollTo(0, document.body.scrollHeight);", + "document.querySelector('.load-more').click();" + ], + wait_for="css:.main-content", + css_selector=".main-content" + ) + print(result.markdown[:500]) # Output the main content extracted + ``` + +7) **Wrap Up & Next Steps**: + + - Recap how JavaScript execution allows access to dynamic content, enabling powerful interactions. + - Tease the next video: **Content Cleaning and Fit Markdown** to show how Crawl4AI can extract only the most relevant content from complex pages. + +--- + +This outline explains how to handle dynamic content and JavaScript-based interactions effectively, enabling users to scrape and interact with complex, modern websites.# Crawl4AI + +## Episode 6: Magic Mode and Anti-Bot Protection + +### Quick Intro +Highlight `Magic Mode` and anti-bot features like user simulation, navigator overrides, and timing randomization. Demo: Access a site with anti-bot protection and show how `Magic Mode` seamlessly handles it. + +Here’s a concise outline for the **Magic Mode and Anti-Bot Protection** video: + +--- + +### **Magic Mode & Anti-Bot Protection** + +1) **Why Anti-Bot Protection is Important**: + + - Many websites use bot detection mechanisms to block automated scraping. Crawl4AI’s anti-detection features help avoid IP bans, CAPTCHAs, and access restrictions. + - **Magic Mode** is a one-step solution to enable a range of anti-bot features without complex configuration. + +2) **Enabling Magic Mode**: + + - Simply set `magic=True` to activate Crawl4AI’s full anti-bot suite: + ```python + result = await crawler.arun( + url="https://example.com", + magic=True # Enables all anti-detection features + ) + ``` + - This enables a blend of stealth techniques, including masking automation signals, randomizing timings, and simulating real user behavior. + +3) **What Magic Mode Does Behind the Scenes**: + + - **User Simulation**: Mimics human actions like mouse movements and scrolling. + - **Navigator Overrides**: Hides signals that indicate an automated browser. + - **Timing Randomization**: Adds random delays to simulate natural interaction patterns. + - **Cookie Handling**: Accepts and manages cookies dynamically to avoid triggers from cookie pop-ups. 
+ +4) **Manual Anti-Bot Options (If Not Using Magic Mode)**: + + - For granular control, you can configure individual settings without Magic Mode: + ```python + result = await crawler.arun( + url="https://example.com", + simulate_user=True, # Enables human-like behavior + override_navigator=True # Hides automation fingerprints + ) + ``` + - **Use Cases**: This approach allows more specific adjustments when certain anti-bot features are needed but others are not. + +5) **Combining Proxies with Magic Mode**: + + - To avoid rate limits or IP blocks, combine Magic Mode with a proxy: + ```python + async with AsyncWebCrawler( + proxy="http://proxy.example.com:8080", + headers={"Accept-Language": "en-US"} + ) as crawler: + result = await crawler.arun( + url="https://example.com", + magic=True # Full anti-detection + ) + ``` + - This setup maximizes stealth by pairing anti-bot detection with IP obfuscation. + +6) **Example of Anti-Bot Protection in Action**: + + - Full example with Magic Mode and proxies to scrape a protected page: + ```python + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/protected-content", + magic=True, + proxy="http://proxy.example.com:8080", + wait_for="css:.content-loaded" # Wait for the main content to load + ) + print(result.markdown[:500]) # Display first 500 characters of the content + ``` + - This example ensures seamless access to protected content by combining anti-detection and waiting for full content load. + +7) **Wrap Up & Next Steps**: + + - Recap the power of Magic Mode and anti-bot features for handling restricted websites. + - Tease the next video: **Content Cleaning and Fit Markdown** to show how to extract clean and focused content from a page. + +--- + +This outline shows users how to easily avoid bot detection and access restricted content, demonstrating both the power and simplicity of Magic Mode in Crawl4AI.# Crawl4AI + +## Episode 7: Content Cleaning and Fit Markdown + +### Quick Intro +Explain content cleaning options, including `fit_markdown` to keep only the most relevant content. Demo: Extract and compare regular vs. fit markdown from a news site or blog. + +Here’s a streamlined outline for the **Content Cleaning and Fit Markdown** video: + +--- + +### **Content Cleaning & Fit Markdown** + +1) **Overview of Content Cleaning in Crawl4AI**: + + - Explain that web pages often include extra elements like ads, navigation bars, footers, and popups. + - Crawl4AI’s content cleaning features help extract only the main content, reducing noise and enhancing readability. + +2) **Basic Content Cleaning Options**: + + - **Removing Unwanted Elements**: Exclude specific HTML tags, like forms or navigation bars: + ```python + result = await crawler.arun( + url="https://example.com", + word_count_threshold=10, # Filter out blocks with fewer than 10 words + excluded_tags=['form', 'nav'], # Exclude specific tags + remove_overlay_elements=True # Remove popups and modals + ) + ``` + - This example extracts content while excluding forms, navigation, and modal overlays, ensuring clean results. + +3) **Fit Markdown for Main Content Extraction**: + + - **What is Fit Markdown**: Uses advanced analysis to identify the most relevant content (ideal for articles, blogs, and documentation). + - **How it Works**: Analyzes content density, removes boilerplate elements, and maintains formatting for a clear output. 
+ - **Example**: + ```python + result = await crawler.arun(url="https://example.com") + main_content = result.fit_markdown # Extracted main content + print(main_content[:500]) # Display first 500 characters + ``` + - Fit Markdown is especially helpful for long-form content like news articles or blog posts. + +4) **Comparing Fit Markdown with Regular Markdown**: + + - **Fit Markdown** returns the primary content without extraneous elements. + - **Regular Markdown** includes all extracted text in markdown format. + - Example to show the difference: + ```python + all_content = result.markdown # Full markdown + main_content = result.fit_markdown # Only the main content + + print(f"All Content Length: {len(all_content)}") + print(f"Main Content Length: {len(main_content)}") + ``` + - This comparison shows the effectiveness of Fit Markdown in focusing on essential content. + +5) **Media and Metadata Handling with Content Cleaning**: + + - **Media Extraction**: Crawl4AI captures images and videos with metadata like alt text, descriptions, and relevance scores: + ```python + for image in result.media["images"]: + print(f"Source: {image['src']}, Alt Text: {image['alt']}, Relevance Score: {image['score']}") + ``` + - **Use Case**: Useful for saving only relevant images or videos from an article or content-heavy page. + +6) **Example of Clean Content Extraction in Action**: + + - Full example extracting cleaned content and Fit Markdown: + ```python + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com", + word_count_threshold=10, + excluded_tags=['nav', 'footer'], + remove_overlay_elements=True + ) + print(result.fit_markdown[:500]) # Show main content + ``` + - This example demonstrates content cleaning with settings for filtering noise and focusing on the core text. + +7) **Wrap Up & Next Steps**: + + - Summarize the power of Crawl4AI’s content cleaning features and Fit Markdown for capturing clean, relevant content. + - Tease the next video: **Link Analysis and Smart Filtering** to focus on analyzing and filtering links within crawled pages. + +--- + +This outline covers Crawl4AI’s content cleaning features and the unique benefits of Fit Markdown, showing users how to retrieve focused, high-quality content from web pages.# Crawl4AI + +## Episode 8: Media Handling: Images, Videos, and Audio + +### Quick Intro +Showcase Crawl4AI’s media extraction capabilities, including lazy-loaded media and metadata. Demo: Crawl a multimedia page, extract images, and show metadata (alt text, context, relevance score). + +Here’s a clear and focused outline for the **Media Handling: Images, Videos, and Audio** video: + +--- + +### **Media Handling: Images, Videos, and Audio** + +1) **Overview of Media Extraction in Crawl4AI**: + + - Crawl4AI can detect and extract different types of media (images, videos, and audio) along with useful metadata. + - This functionality is essential for gathering visual content from multimedia-heavy pages like e-commerce sites, news articles, and social media feeds. + +2) **Image Extraction and Metadata**: + + - Crawl4AI captures images with detailed metadata, including: + - **Source URL**: The direct URL to the image. + - **Alt Text**: Image description if available. + - **Relevance Score**: A score (0–10) indicating how relevant the image is to the main content. + - **Context**: Text surrounding the image on the page. 
+ - **Example**: + ```python + result = await crawler.arun(url="https://example.com") + + for image in result.media["images"]: + print(f"Source: {image['src']}") + print(f"Alt Text: {image['alt']}") + print(f"Relevance Score: {image['score']}") + print(f"Context: {image['context']}") + ``` + - This example shows how to access each image’s metadata, making it easy to filter for the most relevant visuals. + +3) **Handling Lazy-Loaded Images**: + + - Crawl4AI automatically supports lazy-loaded images, which are commonly used to optimize webpage loading. + - **Example with Wait for Lazy-Loaded Content**: + ```python + result = await crawler.arun( + url="https://example.com", + wait_for="css:img[data-src]", # Wait for lazy-loaded images + delay_before_return_html=2.0 # Allow extra time for images to load + ) + ``` + - This setup waits for lazy-loaded images to appear, ensuring they are fully captured. + +4) **Video Extraction and Metadata**: + + - Crawl4AI captures video elements, including: + - **Source URL**: The video’s direct URL. + - **Type**: Format of the video (e.g., MP4). + - **Thumbnail**: A poster or thumbnail image if available. + - **Duration**: Video length, if metadata is provided. + - **Example**: + ```python + for video in result.media["videos"]: + print(f"Video Source: {video['src']}") + print(f"Type: {video['type']}") + print(f"Thumbnail: {video.get('poster')}") + print(f"Duration: {video.get('duration')}") + ``` + - This allows users to gather video content and relevant details for further processing or analysis. + +5) **Audio Extraction and Metadata**: + + - Audio elements can also be extracted, with metadata like: + - **Source URL**: The audio file’s direct URL. + - **Type**: Format of the audio file (e.g., MP3). + - **Duration**: Length of the audio, if available. + - **Example**: + ```python + for audio in result.media["audios"]: + print(f"Audio Source: {audio['src']}") + print(f"Type: {audio['type']}") + print(f"Duration: {audio.get('duration')}") + ``` + - Useful for sites with podcasts, sound bites, or other audio content. + +6) **Filtering Media by Relevance**: + + - Use metadata like relevance score to filter only the most useful media content: + ```python + relevant_images = [img for img in result.media["images"] if img['score'] > 5] + ``` + - This is especially helpful for content-heavy pages where you only want media directly related to the main content. + +7) **Example: Full Media Extraction with Content Filtering**: + + - Full example extracting images, videos, and audio along with filtering by relevance: + ```python + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com", + word_count_threshold=10, # Filter content blocks for relevance + exclude_external_images=True # Only keep internal images + ) + + # Display media summaries + print(f"Relevant Images: {len(relevant_images)}") + print(f"Videos: {len(result.media['videos'])}") + print(f"Audio Clips: {len(result.media['audios'])}") + ``` + - This example shows how to capture and filter various media types, focusing on what’s most relevant. + +8) **Wrap Up & Next Steps**: + + - Recap the comprehensive media extraction capabilities, emphasizing how metadata helps users focus on relevant content. + - Tease the next video: **Link Analysis and Smart Filtering** to explore how Crawl4AI handles internal, external, and social media links for more focused data gathering. 
+ +--- + +This outline provides users with a complete guide to handling images, videos, and audio in Crawl4AI, using metadata to enhance relevance and precision in multimedia extraction.# Crawl4AI + +## Episode 9: Link Analysis and Smart Filtering + +### Quick Intro +Walk through internal and external link classification, social media link filtering, and custom domain exclusion. Demo: Analyze links on a website, focusing on internal navigation vs. external or ad links. + +Here’s a focused outline for the **Link Analysis and Smart Filtering** video: + +--- + +### **Link Analysis & Smart Filtering** + +1) **Importance of Link Analysis in Web Crawling**: + + - Explain that web pages often contain numerous links, including internal links, external links, social media links, and ads. + - Crawl4AI’s link analysis and filtering options help extract only relevant links, enabling more targeted and efficient crawls. + +2) **Automatic Link Classification**: + + - Crawl4AI categorizes links automatically into internal, external, and social media links. + - **Example**: + ```python + result = await crawler.arun(url="https://example.com") + + # Access internal and external links + internal_links = result.links["internal"] + external_links = result.links["external"] + + # Print first few links for each type + print("Internal Links:", internal_links[:3]) + print("External Links:", external_links[:3]) + ``` + +3) **Filtering Out Unwanted Links**: + + - **Exclude External Links**: Remove all links pointing to external sites. + - **Exclude Social Media Links**: Filter out social media domains like Facebook or Twitter. + - **Example**: + ```python + result = await crawler.arun( + url="https://example.com", + exclude_external_links=True, # Remove external links + exclude_social_media_links=True # Remove social media links + ) + ``` + +4) **Custom Domain Filtering**: + + - **Exclude Specific Domains**: Filter links from particular domains, e.g., ad sites. + - **Custom Social Media Domains**: Add additional social media domains if needed. + - **Example**: + ```python + result = await crawler.arun( + url="https://example.com", + exclude_domains=["ads.com", "trackers.com"], + exclude_social_media_domains=["facebook.com", "linkedin.com"] + ) + ``` + +5) **Accessing Link Context and Metadata**: + + - Crawl4AI provides additional metadata for each link, including its text, type (e.g., navigation or content), and surrounding context. + - **Example**: + ```python + for link in result.links["internal"]: + print(f"Link: {link['href']}, Text: {link['text']}, Context: {link['context']}") + ``` + - **Use Case**: Helps users understand the relevance of links based on where they are placed on the page (e.g., navigation vs. article content). + +6) **Example of Comprehensive Link Filtering and Analysis**: + + - Full example combining link filtering, metadata access, and contextual information: + ```python + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com", + exclude_external_links=True, + exclude_social_media_links=True, + exclude_domains=["ads.com"], + css_selector=".main-content" # Focus only on main content area + ) + for link in result.links["internal"]: + print(f"Internal Link: {link['href']}, Text: {link['text']}, Context: {link['context']}") + ``` + - This example filters unnecessary links, keeping only internal and relevant links from the main content area. 
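+   - **Post-processing sketch**: To turn the filtered links above into a quick overview, a small helper can tally them by domain using only the `href` field shown in the examples; `urlparse` and `Counter` come from the Python standard library, and the helper name is hypothetical.
+     ```python
+     from collections import Counter
+     from urllib.parse import urlparse
+
+     def summarize_links(result):
+         # Count how many of the remaining links point at each domain.
+         links = result.links.get("internal", []) + result.links.get("external", [])
+         domains = Counter(urlparse(link["href"]).netloc for link in links if link.get("href"))
+         for domain, count in domains.most_common(5):
+             print(f"{domain}: {count} links")
+     ```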
+ +7) **Wrap Up & Next Steps**: + + - Summarize the benefits of link filtering for efficient crawling and relevant content extraction. + - Tease the next video: **Custom Headers, Identity Management, and User Simulation** to explain how to configure identity settings and simulate user behavior for stealthier crawls. + +--- + +This outline provides a practical overview of Crawl4AI’s link analysis and filtering features, helping users target only essential links while eliminating distractions.# Crawl4AI + +## Episode 10: Custom Headers, Identity, and User Simulation + +### Quick Intro +Teach how to use custom headers, user-agent strings, and simulate real user interactions. Demo: Set custom user-agent and headers to access a site that blocks typical crawlers. + +Here’s a concise outline for the **Custom Headers, Identity Management, and User Simulation** video: + +--- + +### **Custom Headers, Identity Management, & User Simulation** + +1) **Why Customize Headers and Identity in Crawling**: + + - Websites often track request headers and browser properties to detect bots. Customizing headers and managing identity help make requests appear more human, improving access to restricted sites. + +2) **Setting Custom Headers**: + + - Customize HTTP headers to mimic genuine browser requests or meet site-specific requirements: + ```python + headers = { + "Accept-Language": "en-US,en;q=0.9", + "X-Requested-With": "XMLHttpRequest", + "Cache-Control": "no-cache" + } + crawler = AsyncWebCrawler(headers=headers) + ``` + - **Use Case**: Customize the `Accept-Language` header to simulate local user settings, or `Cache-Control` to bypass cache for fresh content. + +3) **Setting a Custom User Agent**: + + - Some websites block requests from common crawler user agents. Setting a custom user agent string helps bypass these restrictions: + ```python + crawler = AsyncWebCrawler( + user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + ) + ``` + - **Tip**: Use user-agent strings from popular browsers (e.g., Chrome, Firefox) to improve access and reduce detection risks. + +4) **User Simulation for Human-like Behavior**: + + - Enable `simulate_user=True` to mimic natural user interactions, such as random timing and simulated mouse movements: + ```python + result = await crawler.arun( + url="https://example.com", + simulate_user=True # Simulates human-like behavior + ) + ``` + - **Behavioral Effects**: Adds subtle variations in interactions, making the crawler harder to detect on bot-protected sites. + +5) **Navigator Overrides and Magic Mode for Full Identity Masking**: + + - Use `override_navigator=True` to mask automation indicators like `navigator.webdriver`, which websites check to detect bots: + ```python + result = await crawler.arun( + url="https://example.com", + override_navigator=True # Masks bot-related signals + ) + ``` + - **Combining with Magic Mode**: For a complete anti-bot setup, combine these identity options with `magic=True` for maximum protection: + ```python + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com", + magic=True, # Enables all anti-bot detection features + user_agent="Custom-Agent", # Custom agent with Magic Mode + ) + ``` + - This setup includes all anti-detection techniques like navigator masking, random timing, and user simulation. 
+ +6) **Example: Comprehensive Setup for Identity Management**: + + - A full example combining custom headers, user-agent, and user simulation for a realistic browsing profile: + ```python + async with AsyncWebCrawler( + headers={"Accept-Language": "en-US", "Cache-Control": "no-cache"}, + user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/91.0", + ) as crawler: + result = await crawler.arun( + url="https://example.com/secure-page", + simulate_user=True + ) + print(result.markdown[:500]) # Display extracted content + ``` + - This example enables detailed customization for evading detection and accessing protected pages smoothly. + +7) **Wrap Up & Next Steps**: + + - Recap the value of headers, user-agent customization, and simulation in bypassing bot detection. + - Tease the next video: **Extraction Strategies: JSON CSS, LLM, and Cosine** to dive into structured data extraction methods for high-quality content retrieval. + +--- + +This outline equips users with tools for managing crawler identity and human-like behavior, essential for accessing bot-protected or restricted websites.Here’s a detailed outline for the **JSON-CSS Extraction Strategy** video, covering all key aspects and supported structures in Crawl4AI: + +--- + +### **10.1 JSON-CSS Extraction Strategy** + +#### **1. Introduction to JSON-CSS Extraction** + - JSON-CSS Extraction is used for pulling structured data from pages with repeated patterns, like product listings, article feeds, or directories. + - This strategy allows defining a schema with CSS selectors and data fields, making it easy to capture nested, list-based, or singular elements. + +#### **2. Basic Schema Structure** + - **Schema Fields**: The schema has two main components: + - `baseSelector`: A CSS selector to locate the main elements you want to extract (e.g., each article or product block). + - `fields`: Defines the data fields for each element, supporting various data types and structures. + +#### **3. Simple Field Extraction** + - **Example HTML**: + ```html +
+     <div class="product">
+       <h2 class="title">Sample Product</h2>
+       <span class="price">$19.99</span>
+       <p class="description">This is a sample product.</p>
+     </div>
    + ``` + - **Schema**: + ```python + schema = { + "baseSelector": ".product", + "fields": [ + {"name": "title", "selector": ".title", "type": "text"}, + {"name": "price", "selector": ".price", "type": "text"}, + {"name": "description", "selector": ".description", "type": "text"} + ] + } + ``` + - **Explanation**: Each field captures text content from specified CSS selectors within each `.product` element. + +#### **4. Supported Field Types: Text, Attribute, HTML, Regex** + - **Field Type Options**: + - `text`: Extracts visible text. + - `attribute`: Captures an HTML attribute (e.g., `src`, `href`). + - `html`: Extracts the raw HTML of an element. + - `regex`: Allows regex patterns to extract part of the text. + + - **Example HTML** (including an image): + ```html +
+     <div class="product">
+       <h2 class="title">Sample Product</h2>
+       <img class="product-image" src="product.jpg" alt="Product Image">
+       <span class="price">$19.99</span>
+       <p class="description">Limited time offer.</p>
+     </div>
    + ``` + - **Schema**: + ```python + schema = { + "baseSelector": ".product", + "fields": [ + {"name": "title", "selector": ".title", "type": "text"}, + {"name": "image_url", "selector": ".product-image", "type": "attribute", "attribute": "src"}, + {"name": "price", "selector": ".price", "type": "regex", "pattern": r"\$(\d+\.\d+)"}, + {"name": "description_html", "selector": ".description", "type": "html"} + ] + } + ``` + - **Explanation**: + - `attribute`: Extracts the `src` attribute from `.product-image`. + - `regex`: Extracts the numeric part from `$19.99`. + - `html`: Retrieves the full HTML of the description element. + +#### **5. Nested Field Extraction** + - **Use Case**: Useful when content contains sub-elements, such as an article with author details within it. + - **Example HTML**: + ```html +
+     <div class="article">
+       <h1 class="title">Sample Article</h1>
+       <div class="author">
+         <span class="name">John Doe</span>
+         <span class="bio">Writer and editor</span>
+       </div>
+     </div>
    + ``` + - **Schema**: + ```python + schema = { + "baseSelector": ".article", + "fields": [ + {"name": "title", "selector": ".title", "type": "text"}, + {"name": "author", "type": "nested", "selector": ".author", "fields": [ + {"name": "name", "selector": ".name", "type": "text"}, + {"name": "bio", "selector": ".bio", "type": "text"} + ]} + ] + } + ``` + - **Explanation**: + - `nested`: Extracts `name` and `bio` within `.author`, grouping the author details in a single `author` object. + +#### **6. List and Nested List Extraction** + - **List**: Extracts multiple elements matching the selector as a list. + - **Nested List**: Allows lists within lists, useful for items with sub-lists (e.g., specifications for each product). + - **Example HTML**: + ```html +
+     <div class="product">
+       <h2 class="title">Product with Features</h2>
+       <ul class="features">
+         <li class="feature">Feature 1</li>
+         <li class="feature">Feature 2</li>
+         <li class="feature">Feature 3</li>
+       </ul>
+     </div>
    + ``` + - **Schema**: + ```python + schema = { + "baseSelector": ".product", + "fields": [ + {"name": "title", "selector": ".title", "type": "text"}, + {"name": "features", "type": "list", "selector": ".features .feature", "fields": [ + {"name": "feature", "type": "text"} + ]} + ] + } + ``` + - **Explanation**: + - `list`: Captures each `.feature` item within `.features`, outputting an array of features under the `features` field. + +#### **7. Transformations for Field Values** + - Transformations allow you to modify extracted values (e.g., converting to lowercase). + - Supported transformations: `lowercase`, `uppercase`, `strip`. + - **Example HTML**: + ```html +
+     <div class="product">
+       <h2 class="title">Special Product</h2>
+     </div>
    + ``` + - **Schema**: + ```python + schema = { + "baseSelector": ".product", + "fields": [ + {"name": "title", "selector": ".title", "type": "text", "transform": "uppercase"} + ] + } + ``` + - **Explanation**: The `transform` property changes the `title` to uppercase, useful for standardized outputs. + +#### **8. Full JSON-CSS Extraction Example** + - Combining all elements in a single schema example for a comprehensive crawl: + - **Example HTML**: + ```html +
+ <div class="product">
+   <h2 class="title">Featured Product</h2>
+   <img class="product-image" src="product.jpg" alt="Featured Product">
+   <span class="price">$99.99</span>
+   <p class="description">Best product of the year.</p>
+   <ul class="features">
+     <li class="feature">Durable</li>
+     <li class="feature">Eco-friendly</li>
+   </ul>
+ </div>
    + ``` + - **Schema**: + ```python + schema = { + "baseSelector": ".product", + "fields": [ + {"name": "title", "selector": ".title", "type": "text", "transform": "uppercase"}, + {"name": "image_url", "selector": ".product-image", "type": "attribute", "attribute": "src"}, + {"name": "price", "selector": ".price", "type": "regex", "pattern": r"\$(\d+\.\d+)"}, + {"name": "description", "selector": ".description", "type": "html"}, + {"name": "features", "type": "list", "selector": ".features .feature", "fields": [ + {"name": "feature", "type": "text"} + ]} + ] + } + ``` + - **Explanation**: This schema captures and transforms each aspect of the product, illustrating the JSON-CSS strategy’s versatility for structured extraction. + +#### **9. Wrap Up & Next Steps** + - Summarize JSON-CSS Extraction’s flexibility for structured, pattern-based extraction. + - Tease the next video: **10.2 LLM Extraction Strategy**, focusing on using language models to extract data based on intelligent content analysis. + +--- + +This outline covers each JSON-CSS Extraction option in Crawl4AI, with practical examples and schema configurations, making it a thorough guide for users.# Crawl4AI + +## Episode 11: Extraction Strategies: JSON CSS, LLM, and Cosine + +### Quick Intro +Introduce JSON CSS Extraction Strategy for structured data, LLM Extraction Strategy for intelligent parsing, and Cosine Strategy for clustering similar content. Demo: Use JSON CSS to scrape product details from an e-commerce site. + +Here’s a comprehensive outline for the **LLM Extraction Strategy** video, covering key details and example applications. + +--- + +### **10.2 LLM Extraction Strategy** + +#### **1. Introduction to LLM Extraction Strategy** + - The LLM Extraction Strategy leverages language models to interpret and extract structured data from complex web content. + - Unlike traditional CSS selectors, this strategy uses natural language instructions and schemas to guide the extraction, ideal for unstructured or diverse content. + - Supports **OpenAI**, **Azure OpenAI**, **HuggingFace**, and **Ollama** models, enabling flexibility with both proprietary and open-source providers. + +#### **2. Key Components of LLM Extraction Strategy** + - **Provider**: Specifies the LLM provider (e.g., OpenAI, HuggingFace, Azure). + - **API Token**: Required for most providers, except Ollama (local LLM model). + - **Instruction**: Custom extraction instructions sent to the model, providing flexibility in how the data is structured and extracted. + - **Schema**: Optional, defines structured fields to organize extracted data into JSON format. + - **Extraction Type**: Supports `"block"` for simpler text blocks or `"schema"` when a structured output format is required. + - **Chunking Parameters**: Breaks down large documents, with options to adjust chunk size and overlap rate for more accurate extraction across lengthy texts. + +#### **3. Basic Extraction Example: OpenAI Model Pricing** + - **Goal**: Extract model names and their input and output fees from the OpenAI pricing page. + - **Schema Definition**: + - **Model Name**: Text for model identification. + - **Input Fee**: Token cost for input processing. + - **Output Fee**: Token cost for output generation. 
+ + - **Schema**: + ```python + class OpenAIModelFee(BaseModel): + model_name: str = Field(..., description="Name of the OpenAI model.") + input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") + output_fee: str = Field(..., description="Fee for output token for the OpenAI model.") + ``` + + - **Example Code**: + ```python + async def extract_openai_pricing(): + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://openai.com/api/pricing/", + extraction_strategy=LLMExtractionStrategy( + provider="openai/gpt-4o", + api_token=os.getenv("OPENAI_API_KEY"), + schema=OpenAIModelFee.schema(), + extraction_type="schema", + instruction="Extract model names and fees for input and output tokens from the page." + ), + cache_mode=CacheMode.BYPASS + ) + print(result.extracted_content) + ``` + + - **Explanation**: + - The extraction strategy combines a schema and detailed instruction to guide the LLM in capturing structured data. + - Each model’s name, input fee, and output fee are extracted in a JSON format. + +#### **4. Knowledge Graph Extraction Example** + - **Goal**: Extract entities and their relationships from a document for use in a knowledge graph. + - **Schema Definition**: + - **Entities**: Individual items with descriptions (e.g., people, organizations). + - **Relationships**: Connections between entities, including descriptions and relationship types. + + - **Schema**: + ```python + class Entity(BaseModel): + name: str + description: str + + class Relationship(BaseModel): + entity1: Entity + entity2: Entity + description: str + relation_type: str + + class KnowledgeGraph(BaseModel): + entities: List[Entity] + relationships: List[Relationship] + ``` + + - **Example Code**: + ```python + async def extract_knowledge_graph(): + extraction_strategy = LLMExtractionStrategy( + provider="azure/gpt-4o-mini", + api_token=os.getenv("AZURE_API_KEY"), + schema=KnowledgeGraph.schema(), + extraction_type="schema", + instruction="Extract entities and relationships from the content to build a knowledge graph." + ) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/some-article", + extraction_strategy=extraction_strategy, + cache_mode=CacheMode.BYPASS + ) + print(result.extracted_content) + ``` + + - **Explanation**: + - In this setup, the LLM extracts entities and their relationships based on the schema and instruction. + - The schema organizes results into a JSON-based knowledge graph format. + +#### **5. Key Settings in LLM Extraction** + - **Chunking Options**: + - For long pages, set `chunk_token_threshold` to specify maximum token count per section. + - Adjust `overlap_rate` to control the overlap between chunks, useful for contextual consistency. + - **Example**: + ```python + extraction_strategy = LLMExtractionStrategy( + provider="openai/gpt-4", + api_token=os.getenv("OPENAI_API_KEY"), + chunk_token_threshold=3000, + overlap_rate=0.2, # 20% overlap between chunks + instruction="Extract key insights and relationships." + ) + ``` + - This setup ensures that longer texts are divided into manageable chunks with slight overlap, enhancing the quality of extraction. + +#### **6. Flexible Provider Options for LLM Extraction** + - **Using Proprietary Models**: OpenAI, Azure, and HuggingFace provide robust language models, often suited for complex or detailed extractions. 
+ - **Using Open-Source Models**: Ollama and other open-source models can be deployed locally, suitable for offline or cost-effective extraction. + - **Example Call**: + ```python + await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY")) + await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) + await extract_structured_data_using_llm("ollama/llama3.2") + ``` + +#### **7. Complete Example of LLM Extraction Setup** + - Code to run both the OpenAI pricing and Knowledge Graph extractions, using various providers: + ```python + async def main(): + await extract_openai_pricing() + await extract_knowledge_graph() + + if __name__ == "__main__": + asyncio.run(main()) + ``` + +#### **8. Wrap Up & Next Steps** + - Recap the power of LLM extraction for handling unstructured or complex data extraction tasks. + - Tease the next video: **10.3 Cosine Similarity Strategy** for clustering similar content based on semantic similarity. + +--- + +This outline explains LLM Extraction in Crawl4AI, with examples showing how to extract structured data using custom schemas and instructions. It demonstrates flexibility with multiple providers, ensuring practical application for different use cases.# Crawl4AI + +## Episode 11: Extraction Strategies: JSON CSS, LLM, and Cosine + +### Quick Intro +Introduce JSON CSS Extraction Strategy for structured data, LLM Extraction Strategy for intelligent parsing, and Cosine Strategy for clustering similar content. Demo: Use JSON CSS to scrape product details from an e-commerce site. + +Here’s a structured outline for the **Cosine Similarity Strategy** video, covering key concepts, configuration, and a practical example. + +--- + +### **10.3 Cosine Similarity Strategy** + +#### **1. Introduction to Cosine Similarity Strategy** + - The Cosine Similarity Strategy clusters content by semantic similarity, offering an efficient alternative to LLM-based extraction, especially when speed is a priority. + - Ideal for grouping similar sections of text, this strategy is well-suited for pages with content sections that may need to be classified or tagged, like news articles, product descriptions, or reviews. + +#### **2. Key Configuration Options** + - **semantic_filter**: A keyword-based filter to focus on relevant content. + - **word_count_threshold**: Minimum number of words per cluster, filtering out shorter, less meaningful clusters. + - **max_dist**: Maximum allowable distance between elements in clusters, impacting cluster tightness. + - **linkage_method**: Method for hierarchical clustering, such as `'ward'` (for well-separated clusters). + - **top_k**: Specifies the number of top categories for each cluster. + - **model_name**: Defines the model for embeddings, such as `sentence-transformers/all-MiniLM-L6-v2`. + - **sim_threshold**: Minimum similarity threshold for filtering, allowing control over cluster relevance. + +#### **3. How Cosine Similarity Clustering Works** + - **Step 1**: Embeddings are generated for each text section, transforming them into vectors that capture semantic meaning. + - **Step 2**: Hierarchical clustering groups similar sections based on cosine similarity, forming clusters with related content. + - **Step 3**: Clusters are filtered based on word count, removing those below the `word_count_threshold`. + - **Step 4**: Each cluster is then categorized with tags, if enabled, providing context to each grouped content section. + +#### **4. 
Example Use Case: Clustering Blog Article Sections** + - **Goal**: Group related sections of a blog or news page to identify distinct topics or discussion areas. + - **Example HTML Sections**: + ```text + "The economy is showing signs of recovery, with markets up this quarter.", + "In the sports world, several major teams are preparing for the upcoming season.", + "New advancements in AI technology are reshaping the tech landscape.", + "Market analysts are optimistic about continued growth in tech stocks." + ``` + + - **Code Setup**: + ```python + async def extract_blog_sections(): + extraction_strategy = CosineStrategy( + word_count_threshold=15, + max_dist=0.3, + sim_threshold=0.2, + model_name="sentence-transformers/all-MiniLM-L6-v2", + top_k=2 + ) + async with AsyncWebCrawler() as crawler: + url = "https://example.com/blog-page" + result = await crawler.arun( + url=url, + extraction_strategy=extraction_strategy, + cache_mode=CacheMode.BYPASS + ) + print(result.extracted_content) + ``` + + - **Explanation**: + - **word_count_threshold**: Ensures only clusters with meaningful content are included. + - **sim_threshold**: Filters out clusters with low similarity, focusing on closely related sections. + - **top_k**: Selects top tags, useful for identifying main topics. + +#### **5. Applying Semantic Filtering with Cosine Similarity** + - **Semantic Filter**: Filters sections based on relevance to a specific keyword, such as “technology” for tech articles. + - **Example Code**: + ```python + extraction_strategy = CosineStrategy( + semantic_filter="technology", + word_count_threshold=10, + max_dist=0.25, + model_name="sentence-transformers/all-MiniLM-L6-v2" + ) + ``` + - **Explanation**: + - **semantic_filter**: Only sections with high similarity to the “technology” keyword will be included in the clustering, making it easy to focus on specific topics within a mixed-content page. + +#### **6. Clustering Product Reviews by Similarity** + - **Goal**: Organize product reviews by themes, such as “price,” “quality,” or “durability.” + - **Example Reviews**: + ```text + "The quality of this product is outstanding and well worth the price.", + "I found the product to be durable but a bit overpriced.", + "Great value for the money and long-lasting.", + "The build quality is good, but I expected a lower price point." + ``` + + - **Code Setup**: + ```python + async def extract_product_reviews(): + extraction_strategy = CosineStrategy( + word_count_threshold=20, + max_dist=0.35, + sim_threshold=0.25, + model_name="sentence-transformers/all-MiniLM-L6-v2" + ) + async with AsyncWebCrawler() as crawler: + url = "https://example.com/product-reviews" + result = await crawler.arun( + url=url, + extraction_strategy=extraction_strategy, + cache_mode=CacheMode.BYPASS + ) + print(result.extracted_content) + ``` + + - **Explanation**: + - This configuration clusters similar reviews, grouping feedback by common themes, helping businesses understand customer sentiments around particular product aspects. + +#### **7. Performance Advantages of Cosine Strategy** + - **Speed**: The Cosine Similarity Strategy is faster than LLM-based extraction, as it doesn’t rely on API calls to external LLMs. + - **Local Processing**: The strategy runs locally with pre-trained sentence embeddings, ideal for high-throughput scenarios where cost and latency are concerns. 
+ - **Comparison**: With a well-optimized local model, this method can perform clustering on large datasets quickly, making it suitable for tasks requiring rapid, repeated analysis. + +#### **8. Full Code Example for Clustering News Articles** + - **Code**: + ```python + async def main(): + await extract_blog_sections() + await extract_product_reviews() + + if __name__ == "__main__": + asyncio.run(main()) + ``` + +#### **9. Wrap Up & Next Steps** + - Recap the efficiency and effectiveness of Cosine Similarity for clustering related content quickly. + - Close with a reminder of Crawl4AI’s flexibility across extraction strategies, and prompt users to experiment with different settings to optimize clustering for their specific content. + +--- + +This outline covers Cosine Similarity Strategy’s speed and effectiveness, providing examples that showcase its potential for clustering various content types efficiently.# Crawl4AI + +## Episode 12: Session-Based Crawling for Dynamic Websites + +### Quick Intro +Show session management for handling websites with multiple pages or actions (like “load more” buttons). Demo: Crawl a paginated content page, persisting session data across multiple requests. + +Here’s a detailed outline for the **Session-Based Crawling for Dynamic Websites** video, explaining why sessions are necessary, how to use them, and providing practical examples and a visual diagram to illustrate the concept. + +--- + +### **11. Session-Based Crawling for Dynamic Websites** + +#### **1. Introduction to Session-Based Crawling** + - **What is Session-Based Crawling**: Session-based crawling maintains a continuous browsing session across multiple page states, allowing the crawler to interact with a page and retrieve content that loads dynamically or based on user interactions. + - **Why It’s Needed**: + - In static pages, all content is available directly from a single URL. + - In dynamic websites, content often loads progressively or based on user actions (e.g., clicking “load more,” submitting forms, scrolling). + - Session-based crawling helps simulate user actions, capturing content that is otherwise hidden until specific actions are taken. + +#### **2. Conceptual Diagram for Session-Based Crawling** + + ```mermaid + graph TD + Start[Start Session] --> S1[Initial State (S1)] + S1 -->|Crawl| Content1[Extract Content S1] + S1 -->|Action: Click Load More| S2[State S2] + S2 -->|Crawl| Content2[Extract Content S2] + S2 -->|Action: Scroll Down| S3[State S3] + S3 -->|Crawl| Content3[Extract Content S3] + S3 -->|Action: Submit Form| S4[Final State] + S4 -->|Crawl| Content4[Extract Content S4] + Content4 --> End[End Session] + ``` + + - **Explanation of Diagram**: + - **Start**: Initializes the session and opens the starting URL. + - **State Transitions**: Each action (e.g., clicking “load more,” scrolling) transitions to a new state, where additional content becomes available. + - **Session Persistence**: Keeps the same browsing session active, preserving the state and allowing for a sequence of actions to unfold. + - **End**: After reaching the final state, the session ends, and all accumulated content has been extracted. + +#### **3. Key Components of Session-Based Crawling in Crawl4AI** + - **Session ID**: A unique identifier to maintain the state across requests, allowing the crawler to “remember” previous actions. + - **JavaScript Execution**: Executes JavaScript commands (e.g., clicks, scrolls) to simulate interactions. 
+ - **Wait Conditions**: Ensures the crawler waits for content to load in each state before moving on. + - **Sequential State Transitions**: By defining actions and wait conditions between states, the crawler can navigate through the page as a user would. + +#### **4. Basic Session Example: Multi-Step Content Loading** + - **Goal**: Crawl an article feed that requires several “load more” clicks to display additional content. + - **Code**: + ```python + async def crawl_article_feed(): + async with AsyncWebCrawler() as crawler: + session_id = "feed_session" + + for page in range(3): + result = await crawler.arun( + url="https://example.com/articles", + session_id=session_id, + js_code="document.querySelector('.load-more-button').click();" if page > 0 else None, + wait_for="css:.article", + css_selector=".article" # Target article elements + ) + print(f"Page {page + 1}: Extracted {len(result.extracted_content)} articles") + ``` + - **Explanation**: + - **session_id**: Ensures all requests share the same browsing state. + - **js_code**: Clicks the “load more” button after the initial page load, expanding content on each iteration. + - **wait_for**: Ensures articles have loaded after each click before extraction. + +#### **5. Advanced Example: E-Commerce Product Search with Filter Selection** + - **Goal**: Interact with filters on an e-commerce page to extract products based on selected criteria. + - **Example Steps**: + 1. **State 1**: Load the main product page. + 2. **State 2**: Apply a filter (e.g., “On Sale”) by selecting a checkbox. + 3. **State 3**: Scroll to load additional products and capture updated results. + + - **Code**: + ```python + async def extract_filtered_products(): + async with AsyncWebCrawler() as crawler: + session_id = "product_session" + + # Step 1: Open product page + result = await crawler.arun( + url="https://example.com/products", + session_id=session_id, + wait_for="css:.product-item" + ) + + # Step 2: Apply filter (e.g., "On Sale") + result = await crawler.arun( + url="https://example.com/products", + session_id=session_id, + js_code="document.querySelector('#sale-filter-checkbox').click();", + wait_for="css:.product-item" + ) + + # Step 3: Scroll to load additional products + for _ in range(2): # Scroll down twice + result = await crawler.arun( + url="https://example.com/products", + session_id=session_id, + js_code="window.scrollTo(0, document.body.scrollHeight);", + wait_for="css:.product-item" + ) + print(f"Loaded {len(result.extracted_content)} products after scroll") + ``` + - **Explanation**: + - **State Persistence**: Each action (filter selection and scroll) builds on the previous session state. + - **Multiple Interactions**: Combines clicking a filter with scrolling, demonstrating how the session preserves these actions. + +#### **6. Key Benefits of Session-Based Crawling** + - **Accessing Hidden Content**: Retrieves data that loads only after user actions. + - **Simulating User Behavior**: Handles interactive elements such as “load more” buttons, dropdowns, and filters. + - **Maintaining Continuity Across States**: Enables a sequential process, moving logically from one state to the next, capturing all desired content without reloading the initial state each time. + +#### **7. Additional Configuration Tips** + - **Manage Session End**: Always conclude the session after the final state to release resources. + - **Optimize with Wait Conditions**: Use `wait_for` to ensure complete loading before each extraction. 
+ - **Handling Errors in Session-Based Crawling**: Include error handling for interactions that may fail, ensuring robustness across state transitions. + +#### **8. Complete Code Example: Multi-Step Session Workflow** + - **Example**: + ```python + async def main(): + await crawl_article_feed() + await extract_filtered_products() + + if __name__ == "__main__": + asyncio.run(main()) + ``` + +#### **9. Wrap Up & Next Steps** + - Recap the usefulness of session-based crawling for dynamic content extraction. + - Tease the next video: **Hooks and Custom Workflow with AsyncWebCrawler** to cover advanced customization options for further control over the crawling process. + +--- + +This outline covers session-based crawling from both a conceptual and practical perspective, helping users understand its importance, configure it effectively, and use it to handle complex dynamic content.# Crawl4AI + +## Episode 13: Chunking Strategies for Large Text Processing + +### Quick Intro +Explain Regex, NLP, and Fixed-Length chunking, and when to use each. Demo: Chunk a large article or document for processing by topics or sentences. + +Here’s a structured outline for the **Chunking Strategies for Large Text Processing** video, emphasizing how chunking works within extraction and why it’s crucial for effective data aggregation. + +Here’s a structured outline for the **Chunking Strategies for Large Text Processing** video, explaining each strategy, when to use it, and providing examples to illustrate. + +--- + +### **12. Chunking Strategies for Large Text Processing** + +#### **1. Introduction to Chunking in Crawl4AI** + - **What is Chunking**: Chunking is the process of dividing large text into manageable sections or “chunks,” enabling efficient processing in extraction tasks. + - **Why It’s Needed**: + - When processing large text, feeding it directly into an extraction function (like `F(x)`) can overwhelm memory or token limits. + - Chunking breaks down `x` (the text) into smaller pieces, which are processed sequentially or in parallel by the extraction function, with the final result being an aggregation of all chunks’ processed output. + +#### **2. Key Chunking Strategies and Use Cases** + - Crawl4AI offers various chunking strategies to suit different text structures, chunk sizes, and processing requirements. + - **Choosing a Strategy**: Select based on the type of text (e.g., articles, transcripts) and extraction needs (e.g., simple splitting or context-sensitive processing). + +#### **3. Strategy 1: Regex-Based Chunking** + - **Description**: Uses regular expressions to split text based on specified patterns (e.g., paragraphs or section breaks). + - **Use Case**: Ideal for dividing text by paragraphs or larger logical blocks where sections are clearly separated by line breaks or punctuation. + - **Example**: + - **Pattern**: `r'\n\n'` for double line breaks. + ```python + chunker = RegexChunking(patterns=[r'\n\n']) + text_chunks = chunker.chunk(long_text) + print(text_chunks) # Output: List of paragraphs + ``` + - **Pros**: Flexible for pattern-based chunking. + - **Cons**: Limited to text with consistent formatting. + +#### **4. Strategy 2: NLP Sentence-Based Chunking** + - **Description**: Uses NLP to split text by sentences, ensuring grammatically complete segments. + - **Use Case**: Useful for extracting individual statements, such as in news articles, quotes, or legal text. 
+ - **Example**: + ```python + chunker = NlpSentenceChunking() + sentence_chunks = chunker.chunk(long_text) + print(sentence_chunks) # Output: List of sentences + ``` + - **Pros**: Maintains sentence structure, ideal for tasks needing semantic completeness. + - **Cons**: May create very small chunks, which could limit contextual extraction. + +#### **5. Strategy 3: Topic-Based Segmentation Using TextTiling** + - **Description**: Segments text into topics using TextTiling, identifying topic shifts and key segments. + - **Use Case**: Ideal for long articles, reports, or essays where each section covers a different topic. + - **Example**: + ```python + chunker = TopicSegmentationChunking(num_keywords=3) + topic_chunks = chunker.chunk_with_topics(long_text) + print(topic_chunks) # Output: List of topic segments with keywords + ``` + - **Pros**: Groups related content, preserving topical coherence. + - **Cons**: Depends on identifiable topic shifts, which may not be present in all texts. + +#### **6. Strategy 4: Fixed-Length Word Chunking** + - **Description**: Splits text into chunks based on a fixed number of words. + - **Use Case**: Ideal for text where exact segment size is required, such as processing word-limited documents for LLMs. + - **Example**: + ```python + chunker = FixedLengthWordChunking(chunk_size=100) + word_chunks = chunker.chunk(long_text) + print(word_chunks) # Output: List of 100-word chunks + ``` + - **Pros**: Ensures uniform chunk sizes, suitable for token-based extraction limits. + - **Cons**: May split sentences, affecting semantic coherence. + +#### **7. Strategy 5: Sliding Window Chunking** + - **Description**: Uses a fixed window size with a step, creating overlapping chunks to maintain context. + - **Use Case**: Useful for maintaining context across sections, as with documents where context is needed for neighboring sections. + - **Example**: + ```python + chunker = SlidingWindowChunking(window_size=100, step=50) + window_chunks = chunker.chunk(long_text) + print(window_chunks) # Output: List of overlapping word chunks + ``` + - **Pros**: Retains context across adjacent chunks, ideal for complex semantic extraction. + - **Cons**: Overlap increases data size, potentially impacting processing time. + +#### **8. Strategy 6: Overlapping Window Chunking** + - **Description**: Similar to sliding windows but with a defined overlap, allowing chunks to share content at the edges. + - **Use Case**: Suitable for handling long texts with essential overlapping information, like research articles or medical records. + - **Example**: + ```python + chunker = OverlappingWindowChunking(window_size=1000, overlap=100) + overlap_chunks = chunker.chunk(long_text) + print(overlap_chunks) # Output: List of overlapping chunks with defined overlap + ``` + - **Pros**: Allows controlled overlap for consistent content coverage across chunks. + - **Cons**: Redundant data in overlapping areas may increase computation. + +#### **9. Practical Example: Using Chunking with an Extraction Strategy** + - **Goal**: Combine chunking with an extraction strategy to process large text effectively. 
+ - **Example Code**: + ```python + from crawl4ai.extraction_strategy import LLMExtractionStrategy + + async def extract_large_text(): + # Initialize chunker and extraction strategy + chunker = FixedLengthWordChunking(chunk_size=200) + extraction_strategy = LLMExtractionStrategy(provider="openai/gpt-4", api_token="your_api_token") + + # Split text into chunks + text_chunks = chunker.chunk(large_text) + + async with AsyncWebCrawler() as crawler: + for chunk in text_chunks: + result = await crawler.arun( + url="https://example.com", + extraction_strategy=extraction_strategy, + content=chunk + ) + print(result.extracted_content) + ``` + + - **Explanation**: + - `chunker.chunk()`: Divides the `large_text` into smaller segments based on the chosen strategy. + - `extraction_strategy`: Processes each chunk separately, and results are then aggregated to form the final output. + +#### **10. Choosing the Right Chunking Strategy** + - **Text Structure**: If text has clear sections (e.g., paragraphs, topics), use Regex or Topic Segmentation. + - **Extraction Needs**: If context is crucial, consider Sliding or Overlapping Window Chunking. + - **Processing Constraints**: For word-limited extractions (e.g., LLMs with token limits), Fixed-Length Word Chunking is often most effective. + +#### **11. Wrap Up & Next Steps** + - Recap the benefits of each chunking strategy and when to use them in extraction workflows. + - Tease the next video: **Hooks and Custom Workflow with AsyncWebCrawler**, focusing on customizing crawler behavior with hooks for a fine-tuned extraction process. + +--- + +This outline provides a complete understanding of chunking strategies, explaining each method’s strengths and best-use scenarios to help users process large texts effectively in Crawl4AI.# Crawl4AI + +## Episode 14: Hooks and Custom Workflow with AsyncWebCrawler + +### Quick Intro +Cover hooks (`on_browser_created`, `before_goto`, `after_goto`) to add custom workflows. Demo: Use hooks to add custom cookies or headers, log HTML, or trigger specific events on page load. + +Here’s a detailed outline for the **Hooks and Custom Workflow with AsyncWebCrawler** video, covering each hook’s purpose, usage, and example implementations. + +--- + +### **13. Hooks and Custom Workflow with AsyncWebCrawler** + +#### **1. Introduction to Hooks in Crawl4AI** + - **What are Hooks**: Hooks are customizable entry points in the crawling process that allow users to inject custom actions or logic at specific stages. + - **Why Use Hooks**: + - They enable fine-grained control over the crawling workflow. + - Useful for performing additional tasks (e.g., logging, modifying headers) dynamically during the crawl. + - Hooks provide the flexibility to adapt the crawler to complex site structures or unique project needs. + +#### **2. Overview of Available Hooks** + - Crawl4AI offers seven key hooks to modify and control different stages in the crawling lifecycle: + - `on_browser_created` + - `on_user_agent_updated` + - `on_execution_started` + - `before_goto` + - `after_goto` + - `before_return_html` + - `before_retrieve_html` + +#### **3. Hook-by-Hook Explanation and Examples** + +--- + +##### **Hook 1: `on_browser_created`** + - **Purpose**: Triggered right after the browser instance is created. + - **Use Case**: + - Initializing browser-specific settings or performing setup actions. + - Configuring browser extensions or scripts before any page is opened. 
+ - **Example**: + ```python + async def log_browser_creation(browser): + print("Browser instance created:", browser) + + crawler.crawler_strategy.set_hook('on_browser_created', log_browser_creation) + ``` + - **Explanation**: This hook logs the browser creation event, useful for tracking when a new browser instance starts. + +--- + +##### **Hook 2: `on_user_agent_updated`** + - **Purpose**: Called whenever the user agent string is updated. + - **Use Case**: + - Modifying the user agent based on page requirements, e.g., changing to a mobile user agent for mobile-only pages. + - **Example**: + ```python + def update_user_agent(user_agent): + print(f"User Agent Updated: {user_agent}") + + crawler.crawler_strategy.set_hook('on_user_agent_updated', update_user_agent) + crawler.update_user_agent("Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X)") + ``` + - **Explanation**: This hook provides a callback every time the user agent changes, helpful for debugging or dynamically altering user agent settings based on conditions. + +--- + +##### **Hook 3: `on_execution_started`** + - **Purpose**: Called right before the crawler begins any interaction (e.g., JavaScript execution, clicks). + - **Use Case**: + - Performing setup actions, such as inserting cookies or initiating custom scripts. + - **Example**: + ```python + async def log_execution_start(page): + print("Execution started on page:", page.url) + + crawler.crawler_strategy.set_hook('on_execution_started', log_execution_start) + ``` + - **Explanation**: Logs the start of any major interaction on the page, ideal for cases where you want to monitor each interaction. + +--- + +##### **Hook 4: `before_goto`** + - **Purpose**: Triggered before navigating to a new URL with `page.goto()`. + - **Use Case**: + - Modifying request headers or setting up conditions right before the page loads. + - Adding headers or dynamically adjusting options for specific URLs. + - **Example**: + ```python + async def modify_headers_before_goto(page): + await page.set_extra_http_headers({"X-Custom-Header": "CustomValue"}) + print("Custom headers set before navigation") + + crawler.crawler_strategy.set_hook('before_goto', modify_headers_before_goto) + ``` + - **Explanation**: This hook allows injecting headers or altering settings based on the page’s needs, particularly useful for pages with custom requirements. + +--- + +##### **Hook 5: `after_goto`** + - **Purpose**: Executed immediately after a page has loaded (after `page.goto()`). + - **Use Case**: + - Checking the loaded page state, modifying the DOM, or performing post-navigation actions (e.g., scrolling). + - **Example**: + ```python + async def post_navigation_scroll(page): + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + print("Scrolled to the bottom after navigation") + + crawler.crawler_strategy.set_hook('after_goto', post_navigation_scroll) + ``` + - **Explanation**: This hook scrolls to the bottom of the page after loading, which can help load dynamically added content like infinite scroll elements. + +--- + +##### **Hook 6: `before_return_html`** + - **Purpose**: Called right before HTML content is retrieved and returned. + - **Use Case**: + - Removing overlays or cleaning up the page for a cleaner HTML extraction. 
+ - **Example**: + ```python + async def remove_advertisements(page, html): + await page.evaluate("document.querySelectorAll('.ad-banner').forEach(el => el.remove());") + print("Advertisements removed before returning HTML") + + crawler.crawler_strategy.set_hook('before_return_html', remove_advertisements) + ``` + - **Explanation**: The hook removes ad banners from the HTML before it’s retrieved, ensuring a cleaner data extraction. + +--- + +##### **Hook 7: `before_retrieve_html`** + - **Purpose**: Runs right before Crawl4AI initiates HTML retrieval. + - **Use Case**: + - Finalizing any page adjustments (e.g., setting timers, waiting for specific elements). + - **Example**: + ```python + async def wait_for_content_before_retrieve(page): + await page.wait_for_selector('.main-content') + print("Main content loaded, ready to retrieve HTML") + + crawler.crawler_strategy.set_hook('before_retrieve_html', wait_for_content_before_retrieve) + ``` + - **Explanation**: This hook waits for the main content to load before retrieving the HTML, ensuring that all essential content is captured. + +#### **4. Setting Hooks in Crawl4AI** + - **How to Set Hooks**: + - Use `set_hook` to define a custom function for each hook. + - Each hook function can be asynchronous (useful for actions like waiting or retrieving async data). + - **Example Setup**: + ```python + crawler.crawler_strategy.set_hook('on_browser_created', log_browser_creation) + crawler.crawler_strategy.set_hook('before_goto', modify_headers_before_goto) + crawler.crawler_strategy.set_hook('after_goto', post_navigation_scroll) + ``` + +#### **5. Complete Example: Using Hooks for a Customized Crawl Workflow** + - **Goal**: Log each key step, set custom headers before navigation, and clean up the page before retrieving HTML. + - **Example Code**: + ```python + async def custom_crawl(): + async with AsyncWebCrawler() as crawler: + # Set hooks for custom workflow + crawler.crawler_strategy.set_hook('on_browser_created', log_browser_creation) + crawler.crawler_strategy.set_hook('before_goto', modify_headers_before_goto) + crawler.crawler_strategy.set_hook('after_goto', post_navigation_scroll) + crawler.crawler_strategy.set_hook('before_return_html', remove_advertisements) + + # Perform the crawl + url = "https://example.com" + result = await crawler.arun(url=url) + print(result.html) # Display or process HTML + ``` + +#### **6. Benefits of Using Hooks in Custom Crawling Workflows** + - **Enhanced Control**: Hooks offer precise control over each stage, allowing adjustments based on content and structure. + - **Efficient Modifications**: Avoid reloading or restarting the session; hooks can alter actions dynamically. + - **Context-Sensitive Actions**: Hooks enable custom logic tailored to specific pages or sections, maximizing extraction quality. + +#### **7. Wrap Up & Next Steps** + - Recap how hooks empower customized workflows in Crawl4AI, enabling flexibility at every stage. + - Tease the next video: **Automating Post-Processing with Crawl4AI**, covering automated steps after data extraction. + +--- + +This outline provides a thorough understanding of hooks, their practical applications, and examples for customizing the crawling workflow in Crawl4AI. 
\ No newline at end of file diff --git a/docs/md_v3/tutorials/advanced-features.md b/docs/md_v3/tutorials/advanced-features.md new file mode 100644 index 0000000000000000000000000000000000000000..16f85874b41ac79944549ff3115943bfc6c7a843 --- /dev/null +++ b/docs/md_v3/tutorials/advanced-features.md @@ -0,0 +1,329 @@ +# Advanced Features (Proxy, PDF, Screenshot, SSL, Headers, & Storage State) + +Crawl4AI offers multiple power-user features that go beyond simple crawling. This tutorial covers: + +1. **Proxy Usage** +2. **Capturing PDFs & Screenshots** +3. **Handling SSL Certificates** +4. **Custom Headers** +5. **Session Persistence & Local Storage** + +> **Prerequisites** +> - You have a basic grasp of [AsyncWebCrawler Basics](./async-webcrawler-basics.md) +> - You know how to run or configure your Python environment with Playwright installed + +--- + +## 1. Proxy Usage + +If you need to route your crawl traffic through a proxy—whether for IP rotation, geo-testing, or privacy—Crawl4AI supports it via `BrowserConfig.proxy_config`. + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + +async def main(): + browser_cfg = BrowserConfig( + proxy_config={ + "server": "http://proxy.example.com:8080", + "username": "myuser", + "password": "mypass", + }, + headless=True + ) + crawler_cfg = CrawlerRunConfig( + verbose=True + ) + + async with AsyncWebCrawler(config=browser_cfg) as crawler: + result = await crawler.arun( + url="https://www.whatismyip.com/", + config=crawler_cfg + ) + if result.success: + print("[OK] Page fetched via proxy.") + print("Page HTML snippet:", result.html[:200]) + else: + print("[ERROR]", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Key Points** +- **`proxy_config`** expects a dict with `server` and optional auth credentials. +- Many commercial proxies provide an HTTP/HTTPS “gateway” server that you specify in `server`. +- If your proxy doesn’t need auth, omit `username`/`password`. + +--- + +## 2. Capturing PDFs & Screenshots + +Sometimes you need a visual record of a page or a PDF “printout.” Crawl4AI can do both in one pass: + +```python +import os, asyncio +from base64 import b64decode +from crawl4ai import AsyncWebCrawler, CacheMode + +async def main(): + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://en.wikipedia.org/wiki/List_of_common_misconceptions", + cache_mode=CacheMode.BYPASS, + pdf=True, + screenshot=True + ) + + if result.success: + # Save screenshot + if result.screenshot: + with open("wikipedia_screenshot.png", "wb") as f: + f.write(b64decode(result.screenshot)) + + # Save PDF + if result.pdf: + with open("wikipedia_page.pdf", "wb") as f: + f.write(b64decode(result.pdf)) + + print("[OK] PDF & screenshot captured.") + else: + print("[ERROR]", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Why PDF + Screenshot?** +- Large or complex pages can be slow or error-prone with “traditional” full-page screenshots. +- Exporting a PDF is more reliable for very long pages. Crawl4AI automatically converts the first PDF page into an image if you request both. + +**Relevant Parameters** +- **`pdf=True`**: Exports the current page as a PDF (base64-encoded in `result.pdf`). +- **`screenshot=True`**: Creates a screenshot (base64-encoded in `result.screenshot`). +- **`scan_full_page`** or advanced hooking can further refine how the crawler captures content. + +--- + +## 3. 
Handling SSL Certificates + +If you need to verify or export a site’s SSL certificate—for compliance, debugging, or data analysis—Crawl4AI can fetch it during the crawl: + +```python +import asyncio, os +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode + +async def main(): + tmp_dir = os.path.join(os.getcwd(), "tmp") + os.makedirs(tmp_dir, exist_ok=True) + + config = CrawlerRunConfig( + fetch_ssl_certificate=True, + cache_mode=CacheMode.BYPASS + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://example.com", config=config) + + if result.success and result.ssl_certificate: + cert = result.ssl_certificate + print("\nCertificate Information:") + print(f"Issuer (CN): {cert.issuer.get('CN', '')}") + print(f"Valid until: {cert.valid_until}") + print(f"Fingerprint: {cert.fingerprint}") + + # Export in multiple formats: + cert.to_json(os.path.join(tmp_dir, "certificate.json")) + cert.to_pem(os.path.join(tmp_dir, "certificate.pem")) + cert.to_der(os.path.join(tmp_dir, "certificate.der")) + + print("\nCertificate exported to JSON/PEM/DER in 'tmp' folder.") + else: + print("[ERROR] No certificate or crawl failed.") + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Key Points** +- **`fetch_ssl_certificate=True`** triggers certificate retrieval. +- `result.ssl_certificate` includes methods (`to_json`, `to_pem`, `to_der`) for saving in various formats (handy for server config, Java keystores, etc.). + +--- + +## 4. Custom Headers + +Sometimes you need to set custom headers (e.g., language preferences, authentication tokens, or specialized user-agent strings). You can do this in multiple ways: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def main(): + # Option 1: Set headers at the crawler strategy level + crawler1 = AsyncWebCrawler( + # The underlying strategy can accept headers in its constructor + crawler_strategy=None # We'll override below for clarity + ) + crawler1.crawler_strategy.update_user_agent("MyCustomUA/1.0") + crawler1.crawler_strategy.set_custom_headers({ + "Accept-Language": "fr-FR,fr;q=0.9" + }) + result1 = await crawler1.arun("https://www.example.com") + print("Example 1 result success:", result1.success) + + # Option 2: Pass headers directly to `arun()` + crawler2 = AsyncWebCrawler() + result2 = await crawler2.arun( + url="https://www.example.com", + headers={"Accept-Language": "es-ES,es;q=0.9"} + ) + print("Example 2 result success:", result2.success) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Notes** +- Some sites may react differently to certain headers (e.g., `Accept-Language`). +- If you need advanced user-agent randomization or client hints, see [Identity-Based Crawling (Anti-Bot)](./identity-anti-bot.md) or use `UserAgentGenerator`. + +--- + +## 5. Session Persistence & Local Storage + +Crawl4AI can preserve cookies and localStorage so you can continue where you left off—ideal for logging into sites or skipping repeated auth flows. 
+ +### 5.1 `storage_state` + +```python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def main(): + storage_dict = { + "cookies": [ + { + "name": "session", + "value": "abcd1234", + "domain": "example.com", + "path": "/", + "expires": 1699999999.0, + "httpOnly": False, + "secure": False, + "sameSite": "None" + } + ], + "origins": [ + { + "origin": "https://example.com", + "localStorage": [ + {"name": "token", "value": "my_auth_token"} + ] + } + ] + } + + # Provide the storage state as a dictionary to start "already logged in" + async with AsyncWebCrawler( + headless=True, + storage_state=storage_dict + ) as crawler: + result = await crawler.arun("https://example.com/protected") + if result.success: + print("Protected page content length:", len(result.html)) + else: + print("Failed to crawl protected page") + +if __name__ == "__main__": + asyncio.run(main()) +``` + +### 5.2 Exporting & Reusing State + +You can sign in once, export the browser context, and reuse it later—without re-entering credentials. + +- **`await context.storage_state(path="my_storage.json")`**: Exports cookies, localStorage, etc. to a file. +- Provide `storage_state="my_storage.json"` on subsequent runs to skip the login step. + +**See**: [Detailed session management tutorial](./hooks-custom.md#using-storage_state) or [Explanations → Browser Context & Managed Browser](../../explanations/browser-management.md) for more advanced scenarios (like multi-step logins, or capturing after interactive pages). + +--- + +## Putting It All Together + +Here’s a snippet that combines multiple “advanced” features (proxy, PDF, screenshot, SSL, custom headers, and session reuse) into one run. Normally, you’d tailor each setting to your project’s needs. + +```python +import os, asyncio +from base64 import b64decode +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode + +async def main(): + # 1. Browser config with proxy + headless + browser_cfg = BrowserConfig( + proxy_config={ + "server": "http://proxy.example.com:8080", + "username": "myuser", + "password": "mypass", + }, + headless=True, + ) + + # 2. Crawler config with PDF, screenshot, SSL, custom headers, and ignoring caches + crawler_cfg = CrawlerRunConfig( + pdf=True, + screenshot=True, + fetch_ssl_certificate=True, + cache_mode=CacheMode.BYPASS, + headers={"Accept-Language": "en-US,en;q=0.8"}, + storage_state="my_storage.json", # Reuse session from a previous sign-in + verbose=True, + ) + + # 3. Crawl + async with AsyncWebCrawler(config=browser_cfg) as crawler: + result = await crawler.arun("https://secure.example.com/protected", config=crawler_cfg) + + if result.success: + print("[OK] Crawled the secure page. 
Links found:", len(result.links.get("internal", []))) + + # Save PDF & screenshot + if result.pdf: + with open("result.pdf", "wb") as f: + f.write(b64decode(result.pdf)) + if result.screenshot: + with open("result.png", "wb") as f: + f.write(b64decode(result.screenshot)) + + # Check SSL cert + if result.ssl_certificate: + print("SSL Issuer CN:", result.ssl_certificate.issuer.get("CN", "")) + else: + print("[ERROR]", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +--- + +## Conclusion & Next Steps + +You’ve now explored several **advanced** features: + +- **Proxy Usage** +- **PDF & Screenshot** capturing for large or critical pages +- **SSL Certificate** retrieval & exporting +- **Custom Headers** for language or specialized requests +- **Session Persistence** via storage state + +**Where to go next**: + +- **[Hooks & Custom Code](./hooks-custom.md)**: For multi-step interactions (clicking “Load More,” performing logins, etc.) +- **[Identity-Based Crawling & Anti-Bot](./identity-anti-bot.md)**: If you need more sophisticated user simulation or stealth. +- **[Reference → BrowserConfig & CrawlerRunConfig](../../reference/configuration.md)**: Detailed param descriptions for everything you’ve seen here and more. + +With these power tools, you can build robust scraping workflows that mimic real user behavior, handle secure sites, capture detailed snapshots, and manage sessions across multiple runs—streamlining your entire data collection pipeline. + +**Last Updated**: 2024-XX-XX \ No newline at end of file diff --git a/docs/md_v3/tutorials/async-webcrawler-basics.md b/docs/md_v3/tutorials/async-webcrawler-basics.md new file mode 100644 index 0000000000000000000000000000000000000000..6236d89924ddb7f471f0b5851b2698b6fe2750b8 --- /dev/null +++ b/docs/md_v3/tutorials/async-webcrawler-basics.md @@ -0,0 +1,235 @@ +Below is a sample Markdown file (`tutorials/async-webcrawler-basics.md`) illustrating how you might teach new users the fundamentals of `AsyncWebCrawler`. This tutorial builds on the **Getting Started** section by introducing key configuration parameters and the structure of the crawl result. Feel free to adjust the code snippets, wording, or format to match your style. + +--- + +# AsyncWebCrawler Basics + +In this tutorial, you’ll learn how to: + +1. Create and configure an `AsyncWebCrawler` instance +2. Understand the `CrawlResult` object returned by `arun()` +3. Use basic `BrowserConfig` and `CrawlerRunConfig` options to tailor your crawl + +> **Prerequisites** +> - You’ve already completed the [Getting Started](./getting-started.md) tutorial (or have equivalent knowledge). +> - You have **Crawl4AI** installed and configured with Playwright. + +--- + +## 1. What is `AsyncWebCrawler`? + +`AsyncWebCrawler` is the central class for running asynchronous crawling operations in Crawl4AI. It manages browser sessions, handles dynamic pages (if needed), and provides you with a structured result object for each crawl. Essentially, it’s your high-level interface for collecting page data. + +```python +from crawl4ai import AsyncWebCrawler + +async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com") + print(result) +``` + +--- + +## 2. Creating a Basic `AsyncWebCrawler` Instance + +Below is a simple code snippet showing how to create and use `AsyncWebCrawler`. This goes one step beyond the minimal example you saw in [Getting Started](./getting-started.md). 
+ +```python +import asyncio +from crawl4ai import AsyncWebCrawler +from crawl4ai import BrowserConfig, CrawlerRunConfig + +async def main(): + # 1. Set up configuration objects (optional if you want defaults) + browser_config = BrowserConfig( + browser_type="chromium", + headless=True, + verbose=True + ) + crawler_config = CrawlerRunConfig( + page_timeout=30000, # 30 seconds + wait_for_images=True, + verbose=True + ) + + # 2. Initialize AsyncWebCrawler with your chosen browser config + async with AsyncWebCrawler(config=browser_config) as crawler: + # 3. Run a single crawl + url_to_crawl = "https://example.com" + result = await crawler.arun(url=url_to_crawl, config=crawler_config) + + # 4. Inspect the result + if result.success: + print(f"Successfully crawled: {result.url}") + print(f"HTML length: {len(result.html)}") + print(f"Markdown snippet: {result.markdown[:200]}...") + else: + print(f"Failed to crawl {result.url}. Error: {result.error_message}") + +if __name__ == "__main__": + asyncio.run(main()) +``` + +### Key Points + +1. **`BrowserConfig`** is optional, but it’s the place to specify browser-related settings (e.g., `headless`, `browser_type`). +2. **`CrawlerRunConfig`** deals with how you want the crawler to behave for this particular run (timeouts, waiting for images, etc.). +3. **`arun()`** is the main method to crawl a single URL. We’ll see how `arun_many()` works in later tutorials. + +--- + +## 3. Understanding `CrawlResult` + +When you call `arun()`, you get back a `CrawlResult` object containing all the relevant data from that crawl attempt. Some common fields include: + +```python +class CrawlResult(BaseModel): + url: str + html: str + success: bool + cleaned_html: Optional[str] = None + media: Dict[str, List[Dict]] = {} + links: Dict[str, List[Dict]] = {} + screenshot: Optional[str] = None # base64-encoded screenshot if requested + pdf: Optional[bytes] = None # binary PDF data if requested + markdown: Optional[Union[str, MarkdownGenerationResult]] = None + markdown_v2: Optional[MarkdownGenerationResult] = None + error_message: Optional[str] = None + # ... plus other fields like status_code, ssl_certificate, extracted_content, etc. +``` + +### Commonly Used Fields + +- **`success`**: `True` if the crawl succeeded, `False` otherwise. +- **`html`**: The raw HTML (or final rendered state if JavaScript was executed). +- **`markdown` / `markdown_v2`**: Contains the automatically generated Markdown representation of the page. +- **`media`**: A dictionary with lists of extracted images, videos, or audio elements. +- **`links`**: A dictionary with lists of “internal” and “external” link objects. +- **`error_message`**: If `success` is `False`, this often contains a description of the error. + +**Example**: + +```python +if result.success: + print("Page Title or snippet of HTML:", result.html[:200]) + if result.markdown: + print("Markdown snippet:", result.markdown[:200]) + print("Links found:", len(result.links.get("internal", [])), "internal links") +else: + print("Error crawling:", result.error_message) +``` + +--- + +## 4. Relevant Basic Parameters + +Below are a few `BrowserConfig` and `CrawlerRunConfig` parameters you might tweak early on. We’ll cover more advanced ones (like proxies, PDF, or screenshots) in later tutorials. 
+ +### 4.1 `BrowserConfig` Essentials + +| Parameter | Description | Default | +|--------------------|-----------------------------------------------------------|----------------| +| `browser_type` | Which browser engine to use: `"chromium"`, `"firefox"`, `"webkit"` | `"chromium"` | +| `headless` | Run the browser with no UI window. If `False`, you see the browser. | `True` | +| `verbose` | Print extra logs for debugging. | `True` | +| `java_script_enabled` | Toggle JavaScript. When `False`, you might speed up loads but lose dynamic content. | `True` | + +### 4.2 `CrawlerRunConfig` Essentials + +| Parameter | Description | Default | +|-----------------------|--------------------------------------------------------------|--------------------| +| `page_timeout` | Maximum time in ms to wait for the page to load or scripts. | `30000` (30s) | +| `wait_for_images` | Wait for images to fully load. Good for accurate rendering. | `True` | +| `css_selector` | Target only certain elements for extraction. | `None` | +| `excluded_tags` | Skip certain HTML tags (like `nav`, `footer`, etc.) | `None` | +| `verbose` | Print logs for debugging. | `True` | + +> **Tip**: Don’t worry if you see lots of parameters. You’ll learn them gradually in later tutorials. + +--- + +## 5. Windows-Specific Configuration + +When using AsyncWebCrawler on Windows, you might encounter a `NotImplementedError` related to `asyncio.create_subprocess_exec`. This is a known Windows-specific issue that occurs because Windows' default event loop doesn't support subprocess operations. + +To resolve this, Crawl4AI provides a utility function to configure Windows to use the ProactorEventLoop. Call this function before running any async operations: + +```python +from crawl4ai.utils import configure_windows_event_loop + +# Call this before any async operations if you're on Windows +configure_windows_event_loop() + +# Your AsyncWebCrawler code here +``` + +--- + +## 6. Putting It All Together + +Here’s a slightly more in-depth example that shows off a few key config parameters at once: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler +from crawl4ai import BrowserConfig, CrawlerRunConfig + +async def main(): + browser_cfg = BrowserConfig( + browser_type="chromium", + headless=True, + java_script_enabled=True, + verbose=False + ) + + crawler_cfg = CrawlerRunConfig( + page_timeout=30000, # wait up to 30 seconds + wait_for_images=True, + css_selector=".article-body", # only extract content under this CSS selector + verbose=True + ) + + async with AsyncWebCrawler(config=browser_cfg) as crawler: + result = await crawler.arun("https://news.example.com", config=crawler_cfg) + + if result.success: + print("[OK] Crawled:", result.url) + print("HTML length:", len(result.html)) + print("Extracted Markdown:", result.markdown_v2.raw_markdown[:300]) + else: + print("[ERROR]", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Key Observations**: +- `css_selector=".article-body"` ensures we only focus on the main content region. +- `page_timeout=30000` helps if the site is slow. +- We turned off `verbose` logs for the browser but kept them on for the crawler config. + +--- + +## 7. Next Steps + +- **Smart Crawling Techniques**: Learn to handle iframes, advanced caching, and selective extraction in the [next tutorial](./smart-crawling.md). +- **Hooks & Custom Code**: See how to inject custom logic before and after navigation in a dedicated [Hooks Tutorial](./hooks-custom.md). 
+- **Reference**: For a complete list of every parameter in `BrowserConfig` and `CrawlerRunConfig`, check out the [Reference section](../../reference/configuration.md). + +--- + +## Summary + +You now know the basics of **AsyncWebCrawler**: +- How to create it with optional browser/crawler configs +- How `arun()` works for single-page crawls +- Where to find your crawled data in `CrawlResult` +- A handful of frequently used configuration parameters + +From here, you can refine your crawler to handle more advanced scenarios, like focusing on specific content or dealing with dynamic elements. Let’s move on to **[Smart Crawling Techniques](./smart-crawling.md)** to learn how to handle iframes, advanced caching, and more. + +--- + +**Last updated**: 2024-XX-XX + +Keep exploring! If you get stuck, remember to check out the [How-To Guides](../../how-to/) for targeted solutions or the [Explanations](../../explanations/) for deeper conceptual background. \ No newline at end of file diff --git a/docs/md_v3/tutorials/docker-quickstart.md b/docs/md_v3/tutorials/docker-quickstart.md new file mode 100644 index 0000000000000000000000000000000000000000..73070baa23b8c2b63d0d741b41498d5b84f867df --- /dev/null +++ b/docs/md_v3/tutorials/docker-quickstart.md @@ -0,0 +1,271 @@ +# Deploying with Docker (Quickstart) + +> **⚠️ WARNING: Experimental & Legacy** +> Our current Docker solution for Crawl4AI is **not stable** and **will be discontinued** soon. A more robust Docker/Orchestration strategy is in development, with a planned stable release in **2025**. If you choose to use this Docker approach, please proceed cautiously and avoid production deployment without thorough testing. + +Crawl4AI is **open-source** and under **active development**. We appreciate your interest, but strongly recommend you make **informed decisions** if you need a production environment. Expect breaking changes in future versions. + +--- + +## 1. Installation & Environment Setup (Outside Docker) + +Before we jump into Docker usage, here’s a quick reminder of how to install Crawl4AI locally (legacy doc). For **non-Docker** deployments or local dev: + +```bash +# 1. Install the package +pip install crawl4ai +crawl4ai-setup + +# 2. Install playwright dependencies (all browsers or specific ones) +playwright install --with-deps +# or +playwright install --with-deps chromium +# or +playwright install --with-deps chrome +``` + +**Testing** your installation: + +```bash +# Visible browser test +python -c "from playwright.sync_api import sync_playwright; p = sync_playwright().start(); browser = p.chromium.launch(headless=False); page = browser.new_page(); page.goto('https://example.com'); input('Press Enter to close...')" +``` + +--- + +## 2. Docker Overview + +This Docker approach allows you to run a **Crawl4AI** service via REST API. You can: + +1. **POST** a request (e.g., URLs, extraction config) +2. **Retrieve** your results from a task-based endpoint + +> **Note**: This Docker solution is **temporary**. We plan a more robust, stable Docker approach in the near future. For now, you can experiment, but do not rely on it for mission-critical production. + +--- + +## 3. Pulling and Running the Image + +### Basic Run + +```bash +docker pull unclecode/crawl4ai:basic +docker run -p 11235:11235 unclecode/crawl4ai:basic +``` + +This starts a container on port `11235`. You can `POST` requests to `http://localhost:11235/crawl`. 
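+
+ For a quick smoke test from the shell, you can exercise the same endpoints with `curl` (a minimal sketch; the request fields and the `task_id` polling flow mirror the Python examples in section 5 below, and `<task_id>` is a placeholder for the value returned by the first call):
+
+ ```bash
+ # Submit a crawl task; the JSON response contains a task_id
+ curl -X POST http://localhost:11235/crawl \
+   -H "Content-Type: application/json" \
+   -d '{"urls": "https://example.com", "priority": 10}'
+
+ # Poll the task endpoint, substituting the returned task_id
+ curl http://localhost:11235/task/<task_id>
+ ```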
+ +### Using an API Token + +```bash +docker run -p 11235:11235 \ + -e CRAWL4AI_API_TOKEN=your_secret_token \ + unclecode/crawl4ai:basic +``` + +If **`CRAWL4AI_API_TOKEN`** is set, you must include `Authorization: Bearer ` in your requests. Otherwise, the service is open to anyone. + +--- + +## 4. Docker Compose for Multi-Container Workflows + +You can also use **Docker Compose** to manage multiple services. Below is an **experimental** snippet: + +```yaml +version: '3.8' + +services: + crawl4ai: + image: unclecode/crawl4ai:basic + ports: + - "11235:11235" + environment: + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} + - OPENAI_API_KEY=${OPENAI_API_KEY:-} + # Additional env variables as needed + volumes: + - /dev/shm:/dev/shm +``` + +To run: + +```bash +docker-compose up -d +``` + +And to stop: + +```bash +docker-compose down +``` + +**Troubleshooting**: + +- **Check logs**: `docker-compose logs -f crawl4ai` +- **Remove orphan containers**: `docker-compose down --remove-orphans` +- **Remove networks**: `docker network rm ` + +--- + +## 5. Making Requests to the Container + +**Base URL**: `http://localhost:11235` + +### Example: Basic Crawl + +```python +import requests + +task_request = { + "urls": "https://example.com", + "priority": 10 +} + +response = requests.post("http://localhost:11235/crawl", json=task_request) +task_id = response.json()["task_id"] + +# Poll for status +status_url = f"http://localhost:11235/task/{task_id}" +status = requests.get(status_url).json() +print(status) +``` + +If you used an API token, do: + +```python +headers = {"Authorization": "Bearer your_secret_token"} +response = requests.post( + "http://localhost:11235/crawl", + headers=headers, + json=task_request +) +``` + +--- + +## 6. Docker + New Crawler Config Approach + +### Using `BrowserConfig` & `CrawlerRunConfig` in Requests + +The Docker-based solution can accept **crawler configurations** in the request JSON (legacy doc might show direct parameters, but we want to embed them in `crawler_params` or `extra` to align with the new approach). For example: + +```python +import requests + +request_data = { + "urls": "https://www.nbcnews.com/business", + "crawler_params": { + "headless": True, + "browser_type": "chromium", + "verbose": True, + "page_timeout": 30000, + # ... any other BrowserConfig-like fields + }, + "extra": { + "word_count_threshold": 50, + "bypass_cache": True + } +} + +response = requests.post("http://localhost:11235/crawl", json=request_data) +task_id = response.json()["task_id"] +``` + +This is the recommended style if you want to replicate `BrowserConfig` and `CrawlerRunConfig` settings in Docker mode. + +--- + +## 7. 
Example: JSON Extraction in Docker + +```python +import requests +import json + +# Define a schema for CSS extraction +schema = { + "name": "Coinbase Crypto Prices", + "baseSelector": ".cds-tableRow-t45thuk", + "fields": [ + { + "name": "crypto", + "selector": "td:nth-child(1) h2", + "type": "text" + }, + { + "name": "symbol", + "selector": "td:nth-child(1) p", + "type": "text" + }, + { + "name": "price", + "selector": "td:nth-child(2)", + "type": "text" + } + ] +} + +request_data = { + "urls": "https://www.coinbase.com/explore", + "extraction_config": { + "type": "json_css", + "params": {"schema": schema} + }, + "crawler_params": { + "headless": True, + "verbose": True + } +} + +resp = requests.post("http://localhost:11235/crawl", json=request_data) +task_id = resp.json()["task_id"] + +# Poll for status +status = requests.get(f"http://localhost:11235/task/{task_id}").json() +if status["status"] == "completed": + extracted_content = status["result"]["extracted_content"] + data = json.loads(extracted_content) + print("Extracted:", len(data), "entries") +else: + print("Task still in progress or failed.") +``` + +--- + +## 8. Why This Docker Is Temporary + +**We are building a new, stable approach**: + +- The current Docker container is **experimental** and might break with future releases. +- We plan a stable release in **2025** with a more robust API, versioning, and orchestration. +- If you use this Docker in production, do so at your own risk and be prepared for **breaking changes**. + +**Community**: Because Crawl4AI is open-source, you can track progress or contribute to the new Docker approach. Check the [GitHub repository](https://github.com/unclecode/crawl4ai) for roadmaps and updates. + +--- + +## 9. Known Limitations & Next Steps + +1. **Not Production-Ready**: This Docker approach lacks extensive security, logging, or advanced config for large-scale usage. +2. **Ongoing Changes**: Expect API changes. The official stable version is targeted for **2025**. +3. **LLM Integrations**: Docker images are big if you want GPU or multiple model providers. We might unify these in a future build. +4. **Performance**: For concurrency or large crawls, you may need to tune resources (memory, CPU) and watch out for ephemeral storage. +5. **Version Pinning**: If you must deploy, pin your Docker tag to a specific version (e.g., `:basic-0.3.7`) to avoid surprise updates. + +### Next Steps + +- **Watch the Repository**: For announcements on the new Docker architecture. +- **Experiment**: Use this Docker for test or dev environments, but keep an eye out for breakage. +- **Contribute**: If you have ideas or improvements, open a PR or discussion. +- **Check Roadmaps**: See our [GitHub issues](https://github.com/unclecode/crawl4ai/issues) or [Roadmap doc](https://github.com/unclecode/crawl4ai/blob/main/ROADMAP.md) to find upcoming releases. + +--- + +## 10. Summary + +**Deploying with Docker** can simplify running Crawl4AI as a service. However: + +- **This Docker** approach is **legacy** and subject to removal/overhaul. +- For production, please weigh the risks carefully. +- Detailed “new Docker approach” is coming in **2025**. + +We hope this guide helps you do a quick spin-up of Crawl4AI in Docker for **experimental** usage. Stay tuned for the fully-supported version! 
\ No newline at end of file diff --git a/docs/md_v3/tutorials/getting-started.md b/docs/md_v3/tutorials/getting-started.md new file mode 100644 index 0000000000000000000000000000000000000000..b148e6e1e53ce399087da5b32644216fe674315b --- /dev/null +++ b/docs/md_v3/tutorials/getting-started.md @@ -0,0 +1,272 @@ +# Getting Started with Crawl4AI + +Welcome to **Crawl4AI**, an open-source LLM friendly Web Crawler & Scraper. In this tutorial, you’ll: + +1. **Install** Crawl4AI (both via pip and Docker, with notes on platform challenges). +2. Run your **first crawl** using minimal configuration. +3. Generate **Markdown** output (and learn how it’s influenced by content filters). +4. Experiment with a simple **CSS-based extraction** strategy. +5. See a glimpse of **LLM-based extraction** (including open-source and closed-source model options). + +--- + +## 1. Introduction + +Crawl4AI provides: +- An asynchronous crawler, **`AsyncWebCrawler`**. +- Configurable browser and run settings via **`BrowserConfig`** and **`CrawlerRunConfig`**. +- Automatic HTML-to-Markdown conversion via **`DefaultMarkdownGenerator`** (supports additional filters). +- Multiple extraction strategies (LLM-based or “traditional” CSS/XPath-based). + +By the end of this guide, you’ll have installed Crawl4AI, performed a basic crawl, generated Markdown, and tried out two extraction strategies. + +--- + +## 2. Installation + +### 2.1 Python + Playwright + +#### Basic Pip Installation + +```bash +pip install crawl4ai +crawl4ai-setup + +# Verify your installation +crawl4ai-doctor +``` + +If you encounter any browser-related issues, you can install them manually: +```bash +python -m playwright install --with-deps chrome chromium +``` + +- **`crawl4ai-setup`** installs and configures Playwright (Chromium by default). + +We cover advanced installation and Docker in the [Installation](#installation) section. + +--- + +## 3. Your First Crawl + +Here’s a minimal Python script that creates an **`AsyncWebCrawler`**, fetches a webpage, and prints the first 300 characters of its Markdown output: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def main(): + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com") + print(result.markdown[:300]) # Print first 300 chars + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**What’s happening?** +- **`AsyncWebCrawler`** launches a headless browser (Chromium by default). +- It fetches `https://example.com`. +- Crawl4AI automatically converts the HTML into Markdown. + +You now have a simple, working crawl! + +--- + +## 4. Basic Configuration (Light Introduction) + +Crawl4AI’s crawler can be heavily customized using two main classes: + +1. **`BrowserConfig`**: Controls browser behavior (headless or full UI, user agent, JavaScript toggles, etc.). +2. **`CrawlerRunConfig`**: Controls how each crawl runs (caching, extraction, timeouts, hooking, etc.). 
+ +Below is an example with minimal usage: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + +async def main(): + browser_conf = BrowserConfig(headless=True) # or False to see the browser + run_conf = CrawlerRunConfig(cache_mode="BYPASS") + + async with AsyncWebCrawler(config=browser_conf) as crawler: + result = await crawler.arun( + url="https://example.com", + config=run_conf + ) + print(result.markdown) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +We’ll explore more advanced config in later tutorials (like enabling proxies, PDF output, multi-tab sessions, etc.). For now, just note how you pass these objects to manage crawling. + +--- + +## 5. Generating Markdown Output + +By default, Crawl4AI automatically generates Markdown from each crawled page. However, the exact output depends on whether you specify a **markdown generator** or **content filter**. + +- **`result.markdown`**: + The direct HTML-to-Markdown conversion. +- **`result.markdown.fit_markdown`**: + The same content after applying any configured **content filter** (e.g., `PruningContentFilter`). + +### Example: Using a Filter with `DefaultMarkdownGenerator` + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.content_filter_strategy import PruningContentFilter +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator + +md_generator = DefaultMarkdownGenerator( + content_filter=PruningContentFilter(threshold=0.4, threshold_type="fixed") +) + +config = CrawlerRunConfig(markdown_generator=md_generator) + +async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://news.ycombinator.com", config=config) + print("Raw Markdown length:", len(result.markdown.raw_markdown)) + print("Fit Markdown length:", len(result.markdown.fit_markdown)) +``` + +**Note**: If you do **not** specify a content filter or markdown generator, you’ll typically see only the raw Markdown. We’ll dive deeper into these strategies in a dedicated **Markdown Generation** tutorial. + +--- + +## 6. Simple Data Extraction (CSS-based) + +Crawl4AI can also extract structured data (JSON) using CSS or XPath selectors. Below is a minimal CSS-based example: + +```python +import asyncio +import json +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy + +async def main(): + schema = { + "name": "Example Items", + "baseSelector": "div.item", + "fields": [ + {"name": "title", "selector": "h2", "type": "text"}, + {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"} + ] + } + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/items", + config=CrawlerRunConfig( + extraction_strategy=JsonCssExtractionStrategy(schema) + ) + ) + # The JSON output is stored in 'extracted_content' + data = json.loads(result.extracted_content) + print(data) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Why is this helpful?** +- Great for repetitive page structures (e.g., item listings, articles). +- No AI usage or costs. +- The crawler returns a JSON string you can parse or store. + +--- + +## 7. Simple Data Extraction (LLM-based) + +For more complex or irregular pages, a language model can parse text intelligently into a structure you define. 
Crawl4AI supports **open-source** or **closed-source** providers: + +- **Open-Source Models** (e.g., `ollama/llama3.3`, `no_token`) +- **OpenAI Models** (e.g., `openai/gpt-4`, requires `api_token`) +- Or any provider supported by the underlying library + +Below is an example using **open-source** style (no token) and closed-source: + +```python +import os +import json +import asyncio +from pydantic import BaseModel, Field +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.extraction_strategy import LLMExtractionStrategy + +class PricingInfo(BaseModel): + model_name: str = Field(..., description="Name of the AI model") + input_fee: str = Field(..., description="Fee for input tokens") + output_fee: str = Field(..., description="Fee for output tokens") + +async def main(): + # 1) Open-Source usage: no token required + llm_strategy_open_source = LLMExtractionStrategy( + provider="ollama/llama3.3", # or "any-other-local-model" + api_token="no_token", # for local models, no API key is typically required + schema=PricingInfo.schema(), + extraction_type="schema", + instruction=""" + From this page, extract all AI model pricing details in JSON format. + Each entry should have 'model_name', 'input_fee', and 'output_fee'. + """, + temperature=0 + ) + + # 2) Closed-Source usage: API key for OpenAI, for example + openai_token = os.getenv("OPENAI_API_KEY", "sk-YOUR_API_KEY") + llm_strategy_openai = LLMExtractionStrategy( + provider="openai/gpt-4", + api_token=openai_token, + schema=PricingInfo.schema(), + extraction_type="schema", + instruction=""" + From this page, extract all AI model pricing details in JSON format. + Each entry should have 'model_name', 'input_fee', and 'output_fee'. + """, + temperature=0 + ) + + # We'll demo the open-source approach here + config = CrawlerRunConfig(extraction_strategy=llm_strategy_open_source) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/pricing", + config=config + ) + print("LLM-based extraction JSON:", result.extracted_content) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**What’s happening?** +- We define a Pydantic schema (`PricingInfo`) describing the fields we want. +- The LLM extraction strategy uses that schema and your instructions to transform raw text into structured JSON. +- Depending on the **provider** and **api_token**, you can use local models or a remote API. + +--- + +## 8. Next Steps + +Congratulations! You have: +1. Installed Crawl4AI (via pip, with Docker as an option). +2. Performed a simple crawl and printed Markdown. +3. Seen how adding a **markdown generator** + **content filter** can produce “fit” Markdown. +4. Experimented with **CSS-based** extraction for repetitive data. +5. Learned the basics of **LLM-based** extraction (open-source and closed-source). + +If you are ready for more, check out: + +- **Installation**: Learn more on how to install Crawl4AI and set up Playwright. +- **Focus on Configuration**: Learn to customize browser settings, caching modes, advanced timeouts, etc. +- **Markdown Generation Basics**: Dive deeper into content filtering and “fit markdown” usage. +- **Dynamic Pages & Hooks**: Tackle sites with “Load More” buttons, login forms, or JavaScript complexities. +- **Deployment**: Run Crawl4AI in Docker containers and scale across multiple nodes. +- **Explanations & How-To Guides**: Explore browser contexts, identity-based crawling, hooking, performance, and more. 
+ +Crawl4AI is a powerful tool for extracting data and generating Markdown from virtually any website. Enjoy exploring, and we hope you build amazing AI-powered applications with it! diff --git a/docs/md_v3/tutorials/getting-warmer.md b/docs/md_v3/tutorials/getting-warmer.md new file mode 100644 index 0000000000000000000000000000000000000000..b2deb414328d934c87a2cc8e1d9eb054731d5291 --- /dev/null +++ b/docs/md_v3/tutorials/getting-warmer.md @@ -0,0 +1,527 @@ +# Crawl4AI Quick Start Guide: Your All-in-One AI-Ready Web Crawling & AI Integration Solution + +Crawl4AI, the **#1 trending GitHub repository**, streamlines web content extraction into AI-ready formats. Perfect for AI assistants, semantic search engines, or data pipelines, Crawl4AI transforms raw HTML into structured Markdown or JSON effortlessly. Integrate with LLMs, open-source models, or your own retrieval-augmented generation workflows. + +**What Crawl4AI is not:** + +Crawl4AI is not a replacement for traditional web scraping libraries, Selenium, or Playwright. It's not designed as a general-purpose web automation tool. Instead, Crawl4AI has a specific, focused goal: + +- To generate perfect, AI-friendly data (particularly for LLMs) from web content +- To maximize speed and efficiency in data extraction and processing +- To operate at scale, from Raspberry Pi to cloud infrastructures + +Crawl4AI is engineered with a "scale-first" mindset, aiming to handle millions of links while maintaining exceptional performance. It's super efficient and fast, optimized to: + +1. Transform raw web content into structured, LLM-ready formats (Markdown/JSON) +2. Implement intelligent extraction strategies to reduce reliance on costly API calls +3. Provide a streamlined pipeline for AI data preparation and ingestion + +In essence, Crawl4AI bridges the gap between web content and AI systems, focusing on delivering high-quality, processed data rather than offering broad web automation capabilities. + +**Key Links:** + +- **Website:** [https://crawl4ai.com](https://crawl4ai.com) +- **GitHub:** [https://github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai) +- **Colab Notebook:** [Try on Google Colab](https://colab.research.google.com/drive/1SgRPrByQLzjRfwoRNq1wSGE9nYY_EE8C?usp=sharing) +- **Quickstart Code Example:** [quickstart_async.config.py](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart_async.config.py) +- **Examples Folder:** [Crawl4AI Examples](https://github.com/unclecode/crawl4ai/tree/main/docs/examples) + +--- + +## Table of Contents + +- [Crawl4AI Quick Start Guide: Your All-in-One AI-Ready Web Crawling \& AI Integration Solution](#crawl4ai-quick-start-guide-your-all-in-one-ai-ready-web-crawling--ai-integration-solution) + - [Table of Contents](#table-of-contents) + - [1. Introduction \& Key Concepts](#1-introduction--key-concepts) + - [2. Installation \& Environment Setup](#2-installation--environment-setup) + - [Test Your Installation](#test-your-installation) + - [3. Core Concepts \& Configuration](#3-core-concepts--configuration) + - [4. Basic Crawling \& Simple Extraction](#4-basic-crawling--simple-extraction) + - [5. Markdown Generation \& AI-Optimized Output](#5-markdown-generation--ai-optimized-output) + - [6. Structured Data Extraction (CSS, XPath, LLM)](#6-structured-data-extraction-css-xpath-llm) + - [7. Advanced Extraction: LLM \& Open-Source Models](#7-advanced-extraction-llm--open-source-models) + - [8. 
Page Interactions, JS Execution, \& Dynamic Content](#8-page-interactions-js-execution--dynamic-content) + - [9. Media, Links, \& Metadata Handling](#9-media-links--metadata-handling) + - [10. Authentication \& Identity Preservation](#10-authentication--identity-preservation) + - [Manual Setup via User Data Directory](#manual-setup-via-user-data-directory) + - [Using `storage_state`](#using-storage_state) + - [11. Proxy \& Security Enhancements](#11-proxy--security-enhancements) + - [12. Screenshots, PDFs \& File Downloads](#12-screenshots-pdfs--file-downloads) + - [13. Caching \& Performance Optimization](#13-caching--performance-optimization) + - [14. Hooks for Custom Logic](#14-hooks-for-custom-logic) + - [15. Dockerization \& Scaling](#15-dockerization--scaling) + - [16. Troubleshooting \& Common Pitfalls](#16-troubleshooting--common-pitfalls) + - [17. Comprehensive End-to-End Example](#17-comprehensive-end-to-end-example) + - [18. Further Resources \& Community](#18-further-resources--community) + +--- + +## 1. Introduction & Key Concepts + +Crawl4AI transforms websites into structured, AI-friendly data. It efficiently handles large-scale crawling, integrates with both proprietary and open-source LLMs, and optimizes content for semantic search or RAG pipelines. + +**Quick Test:** + +```python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def test_run(): + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com") + print(result.markdown) + +asyncio.run(test_run()) +``` + +If you see Markdown output, everything is working! + +**More info:** [See /docs/introduction](#) or [1_introduction.ex.md](https://github.com/unclecode/crawl4ai/blob/main/introduction.ex.md) + +--- + +## 2. Installation & Environment Setup + +```bash +# Install the package +pip install crawl4ai +crawl4ai-setup + +# Install Playwright with system dependencies (recommended) +playwright install --with-deps # Installs all browsers + +# Or install specific browsers: +playwright install --with-deps chrome # Recommended for Colab/Linux +playwright install --with-deps firefox +playwright install --with-deps webkit +playwright install --with-deps chromium + +# Keep Playwright updated periodically +playwright install +``` + +> **Note**: For Google Colab and some Linux environments, use `chrome` instead of `chromium` - it tends to work more reliably. + +### Test Your Installation +Try these one-liners: + +```python +# Visible browser test +python -c "from playwright.sync_api import sync_playwright; p = sync_playwright().start(); browser = p.chromium.launch(headless=False); page = browser.new_page(); page.goto('https://example.com'); input('Press Enter to close...')" + +# Headless test (for servers/CI) +python -c "from playwright.sync_api import sync_playwright; p = sync_playwright().start(); browser = p.chromium.launch(headless=True); page = browser.new_page(); page.goto('https://example.com'); print(f'Title: {page.title()}'); browser.close()" +``` + +You should see a browser window (in visible test) loading example.com. If you get errors, try with Firefox using `playwright install --with-deps firefox`. + + +**Try in Colab:** +[Open Colab Notebook](https://colab.research.google.com/drive/1SgRPrByQLzjRfwoRNq1wSGE9nYY_EE8C?usp=sharing) + +**More info:** [See /docs/configuration](#) or [2_configuration.md](https://github.com/unclecode/crawl4ai/blob/main/configuration.md) + +--- + +## 3. 
Core Concepts & Configuration + +Use `AsyncWebCrawler`, `CrawlerRunConfig`, and `BrowserConfig` to control crawling. + +**Example config:** + +```python +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig + +browser_config = BrowserConfig( + headless=True, + verbose=True, + viewport_width=1080, + viewport_height=600, + text_mode=False, + ignore_https_errors=True, + java_script_enabled=True +) + +run_config = CrawlerRunConfig( + css_selector="article.main", + word_count_threshold=50, + excluded_tags=['nav','footer'], + exclude_external_links=True, + wait_for="css:.article-loaded", + page_timeout=60000, + delay_before_return_html=1.0, + mean_delay=0.1, + max_range=0.3, + process_iframes=True, + remove_overlay_elements=True, + js_code=""" + (async () => { + window.scrollTo(0, document.body.scrollHeight); + await new Promise(r => setTimeout(r, 2000)); + document.querySelector('.load-more')?.click(); + })(); + """ +) + +# Use: ENABLED, DISABLED, BYPASS, READ_ONLY, WRITE_ONLY +# run_config.cache_mode = CacheMode.ENABLED +``` + +**Prefixes:** + +- `http://` or `https://` for live pages +- `file://local.html` for local +- `raw:` for raw HTML strings + +**More info:** [See /docs/async_webcrawler](#) or [3_async_webcrawler.ex.md](https://github.com/unclecode/crawl4ai/blob/main/async_webcrawler.ex.md) + +--- + +## 4. Basic Crawling & Simple Extraction + +```python +async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun("https://news.example.com/article", config=run_config) + print(result.markdown) # Basic markdown content +``` + +**More info:** [See /docs/browser_context_page](#) or [4_browser_context_page.ex.md](https://github.com/unclecode/crawl4ai/blob/main/browser_context_page.ex.md) + +--- + +## 5. Markdown Generation & AI-Optimized Output + +After crawling, `result.markdown_v2` provides: + +- `raw_markdown`: Unfiltered markdown +- `markdown_with_citations`: Links as references at the bottom +- `references_markdown`: A separate list of reference links +- `fit_markdown`: Filtered, relevant markdown (e.g., after BM25) +- `fit_html`: The HTML used to produce `fit_markdown` + +**Example:** + +```python +print("RAW:", result.markdown_v2.raw_markdown[:200]) +print("CITED:", result.markdown_v2.markdown_with_citations[:200]) +print("REFERENCES:", result.markdown_v2.references_markdown) +print("FIT MARKDOWN:", result.markdown_v2.fit_markdown) +``` + +For AI training, `fit_markdown` focuses on the most relevant content. + +**More info:** [See /docs/markdown_generation](#) or [5_markdown_generation.ex.md](https://github.com/unclecode/crawl4ai/blob/main/markdown_generation.ex.md) + +--- + +## 6. 
Structured Data Extraction (CSS, XPath, LLM) + +Extract JSON data without LLMs: + +**CSS:** + +```python +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy + +schema = { + "name": "Products", + "baseSelector": ".product", + "fields": [ + {"name": "title", "selector": "h2", "type": "text"}, + {"name": "price", "selector": ".price", "type": "text"} + ] +} +run_config.extraction_strategy = JsonCssExtractionStrategy(schema) +``` + +**XPath:** + +```python +from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy + +xpath_schema = { + "name": "Articles", + "baseSelector": "//div[@class='article']", + "fields": [ + {"name":"headline","selector":".//h1","type":"text"}, + {"name":"summary","selector":".//p[@class='summary']","type":"text"} + ] +} +run_config.extraction_strategy = JsonXPathExtractionStrategy(xpath_schema) +``` + +**More info:** [See /docs/extraction_strategies](#) or [7_extraction_strategies.ex.md](https://github.com/unclecode/crawl4ai/blob/main/extraction_strategies.ex.md) + +--- + +## 7. Advanced Extraction: LLM & Open-Source Models + +Use LLMExtractionStrategy for complex tasks. Works with OpenAI or open-source models (e.g., Ollama). + +```python +from pydantic import BaseModel +from crawl4ai.extraction_strategy import LLMExtractionStrategy + +class TravelData(BaseModel): + destination: str + attractions: list + +run_config.extraction_strategy = LLMExtractionStrategy( + provider="ollama/nemotron", + schema=TravelData.schema(), + instruction="Extract destination and top attractions." +) +``` + +**More info:** [See /docs/extraction_strategies](#) or [7_extraction_strategies.ex.md](https://github.com/unclecode/crawl4ai/blob/main/extraction_strategies.ex.md) + +--- + +## 8. Page Interactions, JS Execution, & Dynamic Content + +Insert `js_code` and use `wait_for` to ensure content loads. Example: + +```python +run_config.js_code = """ +(async () => { + document.querySelector('.load-more')?.click(); + await new Promise(r => setTimeout(r, 2000)); +})(); +""" +run_config.wait_for = "css:.item-loaded" +``` + +**More info:** [See /docs/page_interaction](#) or [11_page_interaction.md](https://github.com/unclecode/crawl4ai/blob/main/page_interaction.md) + +--- + +## 9. Media, Links, & Metadata Handling + +`result.media["images"]`: List of images with `src`, `score`, `alt`. Score indicates relevance. + +`result.media["videos"]`, `result.media["audios"]` similarly hold media info. + +`result.links["internal"]`, `result.links["external"]`, `result.links["social"]`: Categorized links. Each link has `href`, `text`, `context`, `type`. + +`result.metadata`: Title, description, keywords, author. + +**Example:** + +```python +# Images +for img in result.media["images"]: + print("Image:", img["src"], "Score:", img["score"], "Alt:", img.get("alt","N/A")) + +# Links +for link in result.links["external"]: + print("External Link:", link["href"], "Text:", link["text"]) + +# Metadata +print("Page Title:", result.metadata["title"]) +print("Description:", result.metadata["description"]) +``` + +**More info:** [See /docs/content_selection](#) or [8_content_selection.ex.md](https://github.com/unclecode/crawl4ai/blob/main/content_selection.ex.md) + +--- + +## 10. Authentication & Identity Preservation + +### Manual Setup via User Data Directory + +1. 
**Open Chrome with a custom user data dir:** + + ```bash + "C:\Program Files\Google\Chrome\Application\chrome.exe" --user-data-dir="C:\MyChromeProfile" + ``` + + On macOS: + + ```bash + "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" --user-data-dir="/Users/username/ChromeProfiles/MyProfile" + ``` + +2. **Log in to sites, solve CAPTCHAs, adjust settings manually.** + The browser saves cookies/localStorage in that directory. + +3. **Use `user_data_dir` in `BrowserConfig`:** + + ```python + browser_config = BrowserConfig( + headless=True, + user_data_dir="/Users/username/ChromeProfiles/MyProfile" + ) + ``` + + Now the crawler starts with those cookies, sessions, etc. + +### Using `storage_state` + +Alternatively, export and reuse storage states: + +```python +browser_config = BrowserConfig( + headless=True, + storage_state="mystate.json" # Pre-saved state +) +``` + +No repeated logins needed. + +**More info:** [See /docs/storage_state](#) or [16_storage_state.md](https://github.com/unclecode/crawl4ai/blob/main/storage_state.md) + +--- + +## 11. Proxy & Security Enhancements + +Use `proxy_config` for authenticated proxies: + +```python +browser_config.proxy_config = { + "server": "http://proxy.example.com:8080", + "username": "proxyuser", + "password": "proxypass" +} +``` + +Combine with `headers` or `ignore_https_errors` as needed. + +**More info:** [See /docs/proxy_security](#) or [14_proxy_security.md](https://github.com/unclecode/crawl4ai/blob/main/proxy_security.md) + +--- + +## 12. Screenshots, PDFs & File Downloads + +Enable `screenshot=True` or `pdf=True` in `CrawlerRunConfig`: + +```python +run_config.screenshot = True +run_config.pdf = True +``` + +After crawling: + +```python +if result.screenshot: + with open("page.png", "wb") as f: + f.write(result.screenshot) + +if result.pdf: + with open("page.pdf", "wb") as f: + f.write(result.pdf) +``` + +**File Downloads:** + +```python +browser_config.accept_downloads = True +browser_config.downloads_path = "./downloads" +run_config.js_code = """document.querySelector('a.download')?.click();""" + +# After crawl: +print("Downloaded files:", result.downloaded_files) +``` + +**More info:** [See /docs/screenshot_and_pdf_export](#) or [15_screenshot_and_pdf_export.md](https://github.com/unclecode/crawl4ai/blob/main/screenshot_and_pdf_export.md) +Also [10_file_download.md](https://github.com/unclecode/crawl4ai/blob/main/file_download.md) + +--- + +## 13. Caching & Performance Optimization + +Set `cache_mode` to reuse fetch results: + +```python +from crawl4ai import CacheMode +run_config.cache_mode = CacheMode.ENABLED +``` + +Adjust delays, increase concurrency, or use `text_mode=True` for faster extraction. + +**More info:** [See /docs/cache_modes](#) or [9_cache_modes.md](https://github.com/unclecode/crawl4ai/blob/main/cache_modes.md) + +--- + +## 14. Hooks for Custom Logic + +Hooks let you run code at specific lifecycle events without creating pages manually in `on_browser_created`. 
+ +Use `on_page_context_created` to apply routing or modify page contexts before crawling the URL: + +**Example Hook:** + +```python +async def on_page_context_created_hook(context, page, **kwargs): + # Block all images to speed up load + await context.route("**/*.{png,jpg,jpeg}", lambda route: route.abort()) + print("[HOOK] Image requests blocked") + +async with AsyncWebCrawler(config=browser_config) as crawler: + crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created_hook) + result = await crawler.arun("https://imageheavy.example.com", config=run_config) + print("Crawl finished with images blocked.") +``` + +This hook is clean and doesn’t create a separate page itself—it just modifies the current context/page setup. + +**More info:** [See /docs/hooks_auth](#) or [13_hooks_auth.md](https://github.com/unclecode/crawl4ai/blob/main/hooks_auth.md) + +--- + +## 15. Dockerization & Scaling + +Use Docker images: + +- AMD64 basic: + +```bash +docker pull unclecode/crawl4ai:basic-amd64 +docker run -p 11235:11235 unclecode/crawl4ai:basic-amd64 +``` + +- ARM64 for M1/M2: + +```bash +docker pull unclecode/crawl4ai:basic-arm64 +docker run -p 11235:11235 unclecode/crawl4ai:basic-arm64 +``` + +- GPU support: + +```bash +docker pull unclecode/crawl4ai:gpu-amd64 +docker run --gpus all -p 11235:11235 unclecode/crawl4ai:gpu-amd64 +``` + +Scale with load balancers or Kubernetes. + +**More info:** [See /docs/proxy_security (for proxy) or relevant Docker instructions in README](#) + +--- + +## 16. Troubleshooting & Common Pitfalls + +- Empty results? Relax filters, check selectors. +- Timeouts? Increase `page_timeout` or refine `wait_for`. +- CAPTCHAs? Use `user_data_dir` or `storage_state` after manual solving. +- JS errors? Try headful mode for debugging. + +Check [examples](https://github.com/unclecode/crawl4ai/tree/main/docs/examples) & [quickstart_async.config.py](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart_async.config.py) for more code. + +--- + +## 17. Comprehensive End-to-End Example + +Combine hooks, JS execution, PDF saving, LLM extraction—see [quickstart_async.config.py](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart_async.config.py) for a full example. + +--- + +## 18. Further Resources & Community + +- **Docs:** [https://crawl4ai.com](https://crawl4ai.com) +- **Issues & PRs:** [https://github.com/unclecode/crawl4ai/issues](https://github.com/unclecode/crawl4ai/issues) + +Follow [@unclecode](https://x.com/unclecode) for news & community updates. + +**Happy Crawling!** +Leverage Crawl4AI to feed your AI models with clean, structured web data today. diff --git a/docs/md_v3/tutorials/hooks-custom.md b/docs/md_v3/tutorials/hooks-custom.md new file mode 100644 index 0000000000000000000000000000000000000000..2f1440652156987473ffedb48f1139f17475e643 --- /dev/null +++ b/docs/md_v3/tutorials/hooks-custom.md @@ -0,0 +1,335 @@ +# Hooks & Custom Code + +Crawl4AI supports a **hook** system that lets you run your own Python code at specific points in the crawling pipeline. By injecting logic into these hooks, you can automate tasks like: + +- **Authentication** (log in before navigating) +- **Content manipulation** (modify HTML, inject scripts, etc.) +- **Session or browser configuration** (e.g., adjusting user agents, local storage) +- **Custom data collection** (scrape extra details or track state at each stage) + +In this tutorial, you’ll learn about: + +1. What hooks are available +2. How to attach code to each hook +3. 
Practical examples (auth flows, user agent changes, content manipulation, etc.) + +> **Prerequisites** +> - Familiar with [AsyncWebCrawler Basics](./async-webcrawler-basics.md). +> - Comfortable with Python async/await. + +--- + +## 1. Overview of Available Hooks + +| Hook Name | Called When / Purpose | Context / Objects Provided | +|--------------------------|-----------------------------------------------------------------|-----------------------------------------------------| +| **`on_browser_created`** | Immediately after the browser is launched, but **before** any page or context is created. | **Browser** object only (no `page` yet). Use it for broad browser-level config. | +| **`on_page_context_created`** | Right after a new page context is created. Perfect for setting default timeouts, injecting scripts, etc. | Typically provides `page` and `context`. | +| **`on_user_agent_updated`** | Whenever the user agent changes. For advanced user agent logic or additional header updates. | Typically provides `page` and updated user agent string. | +| **`on_execution_started`** | Right before your main crawling logic runs (before rendering the page). Good for one-time setup or variable initialization. | Typically provides `page`, possibly `context`. | +| **`before_goto`** | Right before navigating to the URL (i.e., `page.goto(...)`). Great for setting cookies, altering the URL, or hooking in authentication steps. | Typically provides `page`, `context`, and `goto_params`. | +| **`after_goto`** | Immediately after navigation completes, but before scraping. For post-login checks or initial content adjustments. | Typically provides `page`, `context`, `response`. | +| **`before_retrieve_html`** | Right before retrieving or finalizing the page’s HTML content. Good for in-page manipulation (e.g., removing ads or disclaimers). | Typically provides `page` or final HTML reference. | +| **`before_return_html`** | Just before the HTML is returned to the crawler pipeline. Last chance to alter or sanitize content. | Typically provides final HTML or a `page`. | + +### A Note on `on_browser_created` (the “unbrowser” hook) +- **No `page`** object is available because no page context exists yet. You can, however, set up browser-wide properties. +- For example, you might control [CDP sessions][cdp] or advanced browser flags here. + +--- + +## 2. Registering Hooks + +You can attach hooks by calling: + +```python +crawler.crawler_strategy.set_hook("hook_name", your_hook_function) +``` + +or by passing a `hooks` dictionary to `AsyncWebCrawler` or your strategy constructor: + +```python +hooks = { + "before_goto": my_before_goto_hook, + "after_goto": my_after_goto_hook, + # ... etc. +} +async with AsyncWebCrawler(hooks=hooks) as crawler: + ... +``` + +### Hook Signature + +Each hook is a function (async or sync, depending on your usage) that receives **certain parameters**—most often `page`, `context`, or custom arguments relevant to that stage. The library then awaits or calls your hook before continuing. + +--- + +## 3. Real-Life Examples + +Below are concrete scenarios where hooks come in handy. + +--- + +### 3.1 Authentication Before Navigation + +One of the most frequent tasks is logging in or applying authentication **before** the crawler navigates to a URL (so that the user is recognized immediately). 
+ +#### Using `before_goto` + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + +async def before_goto_auth_hook(page, context, goto_params, **kwargs): + """ + Example: Set cookies or localStorage to simulate login. + This hook runs right before page.goto() is called. + """ + # Example: Insert cookie-based auth or local storage data + # (You could also do more complex actions, like fill forms if you already have a 'page' open.) + print("[HOOK] Setting auth data before goto.") + await context.add_cookies([ + { + "name": "session", + "value": "abcd1234", + "domain": "example.com", + "path": "/" + } + ]) + # Optionally manipulate goto_params if needed: + # goto_params["url"] = goto_params["url"] + "?debug=1" + +async def main(): + hooks = { + "before_goto": before_goto_auth_hook + } + + browser_cfg = BrowserConfig(headless=True) + crawler_cfg = CrawlerRunConfig() + + async with AsyncWebCrawler(config=browser_cfg, hooks=hooks) as crawler: + result = await crawler.arun(url="https://example.com/protected", config=crawler_cfg) + if result.success: + print("[OK] Logged in and fetched protected page.") + else: + print("[ERROR]", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Key Points** +- `before_goto` receives `page`, `context`, `goto_params` so you can add cookies, localStorage, or even change the URL itself. +- If you need to run a real login flow (submitting forms), consider `on_browser_created` or `on_page_context_created` if you want to do it once at the start. + +--- + +### 3.2 Setting Up the Browser in `on_browser_created` + +If you need to do advanced browser-level configuration (e.g., hooking into the Chrome DevTools Protocol, adjusting command-line flags, etc.), you’ll use `on_browser_created`. No `page` is available yet, but you can set up the **browser** instance itself. + +```python +async def on_browser_created_hook(browser, **kwargs): + """ + Runs immediately after the browser is created, before any pages. + 'browser' here is a Playwright Browser object. + """ + print("[HOOK] Browser created. Setting up custom stuff.") + # Possibly connect to DevTools or create an incognito context + # Example (pseudo-code): + # devtools_url = await browser.new_context(devtools=True) + +# Usage: +async with AsyncWebCrawler(hooks={"on_browser_created": on_browser_created_hook}) as crawler: + ... +``` + +--- + +### 3.3 Adjusting Page or Context in `on_page_context_created` + +If you’d like to set default timeouts or inject scripts right after a page context is spun up: + +```python +async def on_page_context_created_hook(page, context, **kwargs): + print("[HOOK] Page context created. Setting default timeouts or scripts.") + await page.set_default_timeout(20000) # 20 seconds + # Possibly inject a script or set user locale + +# Usage: +hooks = { + "on_page_context_created": on_page_context_created_hook +} +``` + +--- + +### 3.4 Dynamically Updating User Agents + +`on_user_agent_updated` is fired whenever the strategy updates the user agent. 
For instance, you might want to set certain cookies or console-log changes for debugging: + +```python +async def on_user_agent_updated_hook(page, context, new_ua, **kwargs): + print(f"[HOOK] User agent updated to {new_ua}") + # Maybe add a custom header based on new UA + await context.set_extra_http_headers({"X-UA-Source": new_ua}) + +hooks = { + "on_user_agent_updated": on_user_agent_updated_hook +} +``` + +--- + +### 3.5 Initializing Stuff with `on_execution_started` + +`on_execution_started` runs before your main crawling logic. It’s a good place for short, one-time setup tasks (like clearing old caches, or storing a timestamp). + +```python +async def on_execution_started_hook(page, context, **kwargs): + print("[HOOK] Execution started. Setting a start timestamp or logging.") + context.set_default_navigation_timeout(45000) # 45s if your site is slow + +hooks = { + "on_execution_started": on_execution_started_hook +} +``` + +--- + +### 3.6 Post-Processing with `after_goto` + +After the crawler finishes navigating (i.e., the page has presumably loaded), you can do additional checks or manipulations—like verifying you’re on the right page, or removing interstitials: + +```python +async def after_goto_hook(page, context, response, **kwargs): + """ + Called right after page.goto() finishes, but before the crawler extracts HTML. + """ + if response and response.ok: + print("[HOOK] After goto. Status:", response.status) + # Maybe remove popups or check if we landed on a login failure page. + await page.evaluate("""() => { + const popup = document.querySelector(".annoying-popup"); + if (popup) popup.remove(); + }""") + else: + print("[HOOK] Navigation might have failed, status not ok or no response.") + +hooks = { + "after_goto": after_goto_hook +} +``` + +--- + +### 3.7 Last-Minute Modifications in `before_retrieve_html` or `before_return_html` + +Sometimes you need to tweak the page or raw HTML right before it’s captured. + +```python +async def before_retrieve_html_hook(page, context, **kwargs): + """ + Modify the DOM just before the crawler finalizes the HTML. + """ + print("[HOOK] Removing adverts before capturing HTML.") + await page.evaluate("""() => { + const ads = document.querySelectorAll(".ad-banner"); + ads.forEach(ad => ad.remove()); + }""") + +async def before_return_html_hook(page, context, html, **kwargs): + """ + 'html' is the near-finished HTML string. Return an updated string if you like. + """ + # For example, remove personal data or certain tags from the final text + print("[HOOK] Sanitizing final HTML.") + sanitized_html = html.replace("PersonalInfo:", "[REDACTED]") + return sanitized_html + +hooks = { + "before_retrieve_html": before_retrieve_html_hook, + "before_return_html": before_return_html_hook +} +``` + +**Note**: If you want to make last-second changes in `before_return_html`, you can manipulate the `html` string directly. Return a new string if you want to override. + +--- + +## 4. Putting It All Together + +You can combine multiple hooks in a single run. For instance: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + +async def on_browser_created_hook(browser, **kwargs): + print("[HOOK] Browser is up, no page yet. 
Good for broad config.") + +async def before_goto_auth_hook(page, context, goto_params, **kwargs): + print("[HOOK] Adding cookies for auth.") + await context.add_cookies([{"name": "session", "value": "abcd1234", "domain": "example.com"}]) + +async def after_goto_log_hook(page, context, response, **kwargs): + if response: + print("[HOOK] after_goto: Status code:", response.status) + +async def main(): + hooks = { + "on_browser_created": on_browser_created_hook, + "before_goto": before_goto_auth_hook, + "after_goto": after_goto_log_hook + } + + browser_cfg = BrowserConfig(headless=True) + crawler_cfg = CrawlerRunConfig(verbose=True) + + async with AsyncWebCrawler(config=browser_cfg, hooks=hooks) as crawler: + result = await crawler.arun("https://example.com/protected", config=crawler_cfg) + if result.success: + print("[OK] Protected page length:", len(result.html)) + else: + print("[ERROR]", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +This example: + +1. **`on_browser_created`** sets up the brand-new browser instance. +2. **`before_goto`** ensures you inject an auth cookie before accessing the page. +3. **`after_goto`** logs the resulting HTTP status code. + +--- + +## 5. Common Pitfalls & Best Practices + +1. **Hook Order**: If multiple hooks do overlapping tasks (e.g., two `before_goto` hooks), be mindful of conflicts or repeated logic. +2. **Async vs Sync**: Some hooks might be used in a synchronous or asynchronous style. Confirm your function signature. If the crawler expects `async`, define `async def`. +3. **Mutating goto_params**: `goto_params` is a dict that eventually goes to Playwright’s `page.goto()`. Changing the `url` or adding extra fields can be powerful but can also lead to confusion. Document your changes carefully. +4. **Browser vs Page vs Context**: Not all hooks have both `page` and `context`. For example, `on_browser_created` only has access to **`browser`**. +5. **Avoid Overdoing It**: Hooks are powerful but can lead to complexity. If you find yourself writing massive code inside a hook, consider if a separate “how-to” function with a simpler approach might suffice. + +--- + +## Conclusion & Next Steps + +**Hooks** let you bend Crawl4AI to your will: + +- **Authentication** (cookies, localStorage) with `before_goto` +- **Browser-level config** with `on_browser_created` +- **Page or context config** with `on_page_context_created` +- **Content modifications** before capturing HTML (`before_retrieve_html` or `before_return_html`) + +**Where to go next**: + +- **[Identity-Based Crawling & Anti-Bot](./identity-anti-bot.md)**: Combine hooks with advanced user simulation to avoid bot detection. +- **[Reference → AsyncPlaywrightCrawlerStrategy](../../reference/browser-strategies.md)**: Learn more about how hooks are implemented under the hood. +- **[How-To Guides](../../how-to/)**: Check short, specific recipes for tasks like scraping multiple pages with repeated “Load More” clicks. + +With the hook system, you have near-complete control over the browser’s lifecycle—whether it’s setting up environment variables, customizing user agents, or manipulating the HTML. Enjoy the freedom to create sophisticated, fully customized crawling pipelines! 
+ +**Last Updated**: 2024-XX-XX diff --git a/docs/md_v3/tutorials/json-extraction-basic.md b/docs/md_v3/tutorials/json-extraction-basic.md new file mode 100644 index 0000000000000000000000000000000000000000..1a9b79e608737a675d5f09844eb254130fadb81c --- /dev/null +++ b/docs/md_v3/tutorials/json-extraction-basic.md @@ -0,0 +1,395 @@ +# Extracting JSON (No LLM) + +One of Crawl4AI’s **most powerful** features is extracting **structured JSON** from websites **without** relying on large language models. By defining a **schema** with CSS or XPath selectors, you can extract data instantly—even from complex or nested HTML structures—without the cost, latency, or environmental impact of an LLM. + +**Why avoid LLM for basic extractions?** + +1. **Faster & Cheaper**: No API calls or GPU overhead. +2. **Lower Carbon Footprint**: LLM inference can be energy-intensive. A well-defined schema is practically carbon-free. +3. **Precise & Repeatable**: CSS/XPath selectors do exactly what you specify. LLM outputs can vary or hallucinate. +4. **Scales Readily**: For thousands of pages, schema-based extraction runs quickly and in parallel. + +Below, we’ll explore how to craft these schemas and use them with **JsonCssExtractionStrategy** (or **JsonXPathExtractionStrategy** if you prefer XPath). We’ll also highlight advanced features like **nested fields** and **base element attributes**. + +--- + +## 1. Intro to Schema-Based Extraction + +A schema defines: + +1. A **base selector** that identifies each “container” element on the page (e.g., a product row, a blog post card). +2. **Fields** describing which CSS/XPath selectors to use for each piece of data you want to capture (text, attribute, HTML block, etc.). +3. **Nested** or **list** types for repeated or hierarchical structures. + +For example, if you have a list of products, each one might have a name, price, reviews, and “related products.” This approach is faster and more reliable than an LLM for consistent, structured pages. + +--- + +## 2. Simple Example: Crypto Prices + +Let’s begin with a **simple** schema-based extraction using the `JsonCssExtractionStrategy`. Below is a snippet that extracts cryptocurrency prices from a site (similar to the legacy Coinbase example). Notice we **don’t** call any LLM: + +```python +import json +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy + +async def extract_crypto_prices(): + # 1. Define a simple extraction schema + schema = { + "name": "Crypto Prices", + "baseSelector": "div.crypto-row", # Repeated elements + "fields": [ + { + "name": "coin_name", + "selector": "h2.coin-name", + "type": "text" + }, + { + "name": "price", + "selector": "span.coin-price", + "type": "text" + } + ] + } + + # 2. Create the extraction strategy + extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True) + + # 3. Set up your crawler config (if needed) + config = CrawlerRunConfig( + # e.g., pass js_code or wait_for if the page is dynamic + # wait_for="css:.crypto-row:nth-child(20)" + cache_mode = CacheMode.BYPASS, + extraction_strategy=extraction_strategy, + ) + + async with AsyncWebCrawler(verbose=True) as crawler: + # 4. Run the crawl and extraction + result = await crawler.arun( + url="https://example.com/crypto-prices", + + config=config + ) + + if not result.success: + print("Crawl failed:", result.error_message) + return + + # 5. 
Parse the extracted JSON + data = json.loads(result.extracted_content) + print(f"Extracted {len(data)} coin entries") + print(json.dumps(data[0], indent=2) if data else "No data found") + +asyncio.run(extract_crypto_prices()) +``` + +**Highlights**: + +- **`baseSelector`**: Tells us where each “item” (crypto row) is. +- **`fields`**: Two fields (`coin_name`, `price`) using simple CSS selectors. +- Each field defines a **`type`** (e.g., `text`, `attribute`, `html`, `regex`, etc.). + +No LLM is needed, and the performance is **near-instant** for hundreds or thousands of items. + +--- + +### **XPath Example with `raw://` HTML** + +Below is a short example demonstrating **XPath** extraction plus the **`raw://`** scheme. We’ll pass a **dummy HTML** directly (no network request) and define the extraction strategy in `CrawlerRunConfig`. + +```python +import json +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy + +async def extract_crypto_prices_xpath(): + # 1. Minimal dummy HTML with some repeating rows + dummy_html = """ + + +
+    <div class="crypto-row">
+      <h2 class="coin-name">Bitcoin</h2>
+      <span class="coin-price">$28,000</span>
+    </div>
+    <div class="crypto-row">
+      <h2 class="coin-name">Ethereum</h2>
+      <span class="coin-price">$1,800</span>
+    </div>
    + + + """ + + # 2. Define the JSON schema (XPath version) + schema = { + "name": "Crypto Prices via XPath", + "baseSelector": "//div[@class='crypto-row']", + "fields": [ + { + "name": "coin_name", + "selector": ".//h2[@class='coin-name']", + "type": "text" + }, + { + "name": "price", + "selector": ".//span[@class='coin-price']", + "type": "text" + } + ] + } + + # 3. Place the strategy in the CrawlerRunConfig + config = CrawlerRunConfig( + extraction_strategy=JsonXPathExtractionStrategy(schema, verbose=True) + ) + + # 4. Use raw:// scheme to pass dummy_html directly + raw_url = f"raw://{dummy_html}" + + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun( + url=raw_url, + config=config + ) + + if not result.success: + print("Crawl failed:", result.error_message) + return + + data = json.loads(result.extracted_content) + print(f"Extracted {len(data)} coin rows") + if data: + print("First item:", data[0]) + +asyncio.run(extract_crypto_prices_xpath()) +``` + +**Key Points**: + +1. **`JsonXPathExtractionStrategy`** is used instead of `JsonCssExtractionStrategy`. +2. **`baseSelector`** and each field’s `"selector"` use **XPath** instead of CSS. +3. **`raw://`** lets us pass `dummy_html` with no real network request—handy for local testing. +4. Everything (including the extraction strategy) is in **`CrawlerRunConfig`**. + +That’s how you keep the config self-contained, illustrate **XPath** usage, and demonstrate the **raw** scheme for direct HTML input—all while avoiding the old approach of passing `extraction_strategy` directly to `arun()`. + +--- + +## 3. Advanced Schema & Nested Structures + +Real sites often have **nested** or repeated data—like categories containing products, which themselves have a list of reviews or features. For that, we can define **nested** or **list** (and even **nested_list**) fields. + +### Sample E-Commerce HTML + +We have a **sample e-commerce** HTML file on GitHub (example): +``` +https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html +``` +This snippet includes categories, products, features, reviews, and related items. Let’s see how to define a schema that fully captures that structure **without LLM**. 
+ +```python +schema = { + "name": "E-commerce Product Catalog", + "baseSelector": "div.category", + # (1) We can define optional baseFields if we want to extract attributes from the category container + "baseFields": [ + {"name": "data_cat_id", "type": "attribute", "attribute": "data-cat-id"}, + ], + "fields": [ + { + "name": "category_name", + "selector": "h2.category-name", + "type": "text" + }, + { + "name": "products", + "selector": "div.product", + "type": "nested_list", # repeated sub-objects + "fields": [ + { + "name": "name", + "selector": "h3.product-name", + "type": "text" + }, + { + "name": "price", + "selector": "p.product-price", + "type": "text" + }, + { + "name": "details", + "selector": "div.product-details", + "type": "nested", # single sub-object + "fields": [ + {"name": "brand", "selector": "span.brand", "type": "text"}, + {"name": "model", "selector": "span.model", "type": "text"} + ] + }, + { + "name": "features", + "selector": "ul.product-features li", + "type": "list", + "fields": [ + {"name": "feature", "type": "text"} + ] + }, + { + "name": "reviews", + "selector": "div.review", + "type": "nested_list", + "fields": [ + {"name": "reviewer", "selector": "span.reviewer", "type": "text"}, + {"name": "rating", "selector": "span.rating", "type": "text"}, + {"name": "comment", "selector": "p.review-text", "type": "text"} + ] + }, + { + "name": "related_products", + "selector": "ul.related-products li", + "type": "list", + "fields": [ + {"name": "name", "selector": "span.related-name", "type": "text"}, + {"name": "price", "selector": "span.related-price", "type": "text"} + ] + } + ] + } + ] +} +``` + +Key Takeaways: + +- **Nested vs. List**: + - **`type: "nested"`** means a **single** sub-object (like `details`). + - **`type: "list"`** means multiple items that are **simple** dictionaries or single text fields. + - **`type: "nested_list"`** means repeated **complex** objects (like `products` or `reviews`). +- **Base Fields**: We can extract **attributes** from the container element via `"baseFields"`. For instance, `"data_cat_id"` might be `data-cat-id="elect123"`. +- **Transforms**: We can also define a `transform` if we want to lower/upper case, strip whitespace, or even run a custom function. + +### Running the Extraction + +```python +import json +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy + +ecommerce_schema = { + # ... the advanced schema from above ... +} + +async def extract_ecommerce_data(): + strategy = JsonCssExtractionStrategy(ecommerce_schema, verbose=True) + + config = CrawlerRunConfig() + + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun( + url="https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html", + extraction_strategy=strategy, + config=config + ) + + if not result.success: + print("Crawl failed:", result.error_message) + return + + # Parse the JSON output + data = json.loads(result.extracted_content) + print(json.dumps(data, indent=2) if data else "No data found.") + +asyncio.run(extract_ecommerce_data()) +``` + +If all goes well, you get a **structured** JSON array with each “category,” containing an array of `products`. Each product includes `details`, `features`, `reviews`, etc. All of that **without** an LLM. + +--- + +## 4. Why “No LLM” Is Often Better + +1. 
**Zero Hallucination**: Schema-based extraction doesn’t guess text. It either finds it or not. +2. **Guaranteed Structure**: The same schema yields consistent JSON across many pages, so your downstream pipeline can rely on stable keys. +3. **Speed**: LLM-based extraction can be 10–1000x slower for large-scale crawling. +4. **Scalable**: Adding or updating a field is a matter of adjusting the schema, not re-tuning a model. + +**When might you consider an LLM?** Possibly if the site is extremely unstructured or you want AI summarization. But always try a schema approach first for repeated or consistent data patterns. + +--- + +## 5. Base Element Attributes & Additional Fields + +It’s easy to **extract attributes** (like `href`, `src`, or `data-xxx`) from your base or nested elements using: + +```json +{ + "name": "href", + "type": "attribute", + "attribute": "href", + "default": null +} +``` + +You can define them in **`baseFields`** (extracted from the main container element) or in each field’s sub-lists. This is especially helpful if you need an item’s link or ID stored in the parent `
<div>
    `. + +--- + +## 6. Putting It All Together: Larger Example + +Consider a blog site. We have a schema that extracts the **URL** from each post card (via `baseFields` with an `"attribute": "href"`), plus the title, date, summary, and author: + +```python +schema = { + "name": "Blog Posts", + "baseSelector": "a.blog-post-card", + "baseFields": [ + {"name": "post_url", "type": "attribute", "attribute": "href"} + ], + "fields": [ + {"name": "title", "selector": "h2.post-title", "type": "text", "default": "No Title"}, + {"name": "date", "selector": "time.post-date", "type": "text", "default": ""}, + {"name": "summary", "selector": "p.post-summary", "type": "text", "default": ""}, + {"name": "author", "selector": "span.post-author", "type": "text", "default": ""} + ] +} +``` + +Then run with `JsonCssExtractionStrategy(schema)` to get an array of blog post objects, each with `"post_url"`, `"title"`, `"date"`, `"summary"`, `"author"`. + +--- + +## 7. Tips & Best Practices + +1. **Inspect the DOM** in Chrome DevTools or Firefox’s Inspector to find stable selectors. +2. **Start Simple**: Verify you can extract a single field. Then add complexity like nested objects or lists. +3. **Test** your schema on partial HTML or a test page before a big crawl. +4. **Combine with JS Execution** if the site loads content dynamically. You can pass `js_code` or `wait_for` in `CrawlerRunConfig`. +5. **Look at Logs** when `verbose=True`: if your selectors are off or your schema is malformed, it’ll often show warnings. +6. **Use baseFields** if you need attributes from the container element (e.g., `href`, `data-id`), especially for the “parent” item. +7. **Performance**: For large pages, make sure your selectors are as narrow as possible. + +--- + +## 8. Conclusion + +With **JsonCssExtractionStrategy** (or **JsonXPathExtractionStrategy**), you can build powerful, **LLM-free** pipelines that: + +- Scrape any consistent site for structured data. +- Support nested objects, repeating lists, or advanced transformations. +- Scale to thousands of pages quickly and reliably. + +**Next Steps**: + +- Explore the [Advanced Usage of JSON Extraction](../../explanations/extraction-chunking.md) for deeper details on schema nesting, transformations, or hooking. +- Combine your extracted JSON with advanced filtering or summarization in a second pass if needed. +- For dynamic pages, combine strategies with `js_code` or infinite scroll hooking to ensure all content is loaded. + +**Remember**: For repeated, structured data, you don’t need to pay for or wait on an LLM. A well-crafted schema plus CSS or XPath gets you the data faster, cleaner, and cheaper—**the real power** of Crawl4AI. + +**Last Updated**: 2024-XX-XX + +--- + +That’s it for **Extracting JSON (No LLM)**! You’ve seen how schema-based approaches (either CSS or XPath) can handle everything from simple lists to deeply nested product catalogs—instantly, with minimal overhead. Enjoy building robust scrapers that produce consistent, structured JSON for your data pipelines! \ No newline at end of file diff --git a/docs/md_v3/tutorials/json-extraction-llm.md b/docs/md_v3/tutorials/json-extraction-llm.md new file mode 100644 index 0000000000000000000000000000000000000000..5b9369d9f007f3f66e9a18cffdf76b32ecc2d46c --- /dev/null +++ b/docs/md_v3/tutorials/json-extraction-llm.md @@ -0,0 +1,334 @@ +Below is a **draft** of the **Extracting JSON (LLM)** tutorial, illustrating how to use large language models for structured data extraction in Crawl4AI. 
It highlights key parameters (like chunking, overlap, instruction, schema) and explains how the system remains **provider-agnostic** via LightLLM. Adjust field names or code snippets to match your repository’s specifics. + +--- + +# Extracting JSON (LLM) + +In some cases, you need to extract **complex or unstructured** information from a webpage that a simple CSS/XPath schema cannot easily parse. Or you want **AI**-driven insights, classification, or summarization. For these scenarios, Crawl4AI provides an **LLM-based extraction strategy** that: + +1. Works with **any** large language model supported by [LightLLM](https://github.com/LightLLM) (Ollama, OpenAI, Claude, and more). +2. Automatically splits content into chunks (if desired) to handle token limits, then combines results. +3. Lets you define a **schema** (like a Pydantic model) or a simpler “block” extraction approach. + +**Important**: LLM-based extraction can be slower and costlier than schema-based approaches. If your page data is highly structured, consider using [`JsonCssExtractionStrategy`](./json-extraction-basic.md) or [`JsonXPathExtractionStrategy`](./json-extraction-basic.md) first. But if you need AI to interpret or reorganize content, read on! + +--- + +## 1. Why Use an LLM? + +- **Complex Reasoning**: If the site’s data is unstructured, scattered, or full of natural language context. +- **Semantic Extraction**: Summaries, knowledge graphs, or relational data that require comprehension. +- **Flexible**: You can pass instructions to the model to do more advanced transformations or classification. + +--- + +## 2. Provider-Agnostic via LightLLM + +Crawl4AI uses a “provider string” (e.g., `"openai/gpt-4o"`, `"ollama/llama2.0"`, `"aws/titan"`) to identify your LLM. **Any** model that LightLLM supports is fair game. You just provide: + +- **`provider`**: The `/` identifier (e.g., `"openai/gpt-4"`, `"ollama/llama2"`, `"huggingface/google-flan"`, etc.). +- **`api_token`**: If needed (for OpenAI, HuggingFace, etc.); local models or Ollama might not require it. +- **`api_base`** (optional): If your provider has a custom endpoint. + +This means you **aren’t locked** into a single LLM vendor. Switch or experiment easily. + +--- + +## 3. How LLM Extraction Works + +### 3.1 Flow + +1. **Chunking** (optional): The HTML or markdown is split into smaller segments if it’s very long (based on `chunk_token_threshold`, overlap, etc.). +2. **Prompt Construction**: For each chunk, the library forms a prompt that includes your **`instruction`** (and possibly schema or examples). +3. **LLM Inference**: Each chunk is sent to the model in parallel or sequentially (depending on your concurrency). +4. **Combining**: The results from each chunk are merged and parsed into JSON. + +### 3.2 `extraction_type` + +- **`"schema"`**: The model tries to return JSON conforming to your Pydantic-based schema. +- **`"block"`**: The model returns freeform text, or smaller JSON structures, which the library collects. + +For structured data, `"schema"` is recommended. You provide `schema=YourPydanticModel.model_json_schema()`. + +--- + +## 4. Key Parameters + +Below is an overview of important LLM extraction parameters. All are typically set inside `LLMExtractionStrategy(...)`. You then put that strategy in your `CrawlerRunConfig(..., extraction_strategy=...)`. + +1. **`provider`** (str): e.g., `"openai/gpt-4"`, `"ollama/llama2"`. +2. **`api_token`** (str): The API key or token for that model. May not be needed for local models. +3. 
**`schema`** (dict): A JSON schema describing the fields you want. Usually generated by `YourModel.model_json_schema()`. +4. **`extraction_type`** (str): `"schema"` or `"block"`. +5. **`instruction`** (str): Prompt text telling the LLM what you want extracted. E.g., “Extract these fields as a JSON array.” +6. **`chunk_token_threshold`** (int): Maximum tokens per chunk. If your content is huge, you can break it up for the LLM. +7. **`overlap_rate`** (float): Overlap ratio between adjacent chunks. E.g., `0.1` means 10% of each chunk is repeated to preserve context continuity. +8. **`apply_chunking`** (bool): Set `True` to chunk automatically. If you want a single pass, set `False`. +9. **`input_format`** (str): Determines **which** crawler result is passed to the LLM. Options include: + - `"markdown"`: The raw markdown (default). + - `"fit_markdown"`: The filtered “fit” markdown if you used a content filter. + - `"html"`: The cleaned or raw HTML. +10. **`extra_args`** (dict): Additional LLM parameters like `temperature`, `max_tokens`, `top_p`, etc. +11. **`show_usage()`**: A method you can call to print out usage info (token usage per chunk, total cost if known). + +**Example**: + +```python +extraction_strategy = LLMExtractionStrategy( + provider="openai/gpt-4", + api_token="YOUR_OPENAI_KEY", + schema=MyModel.model_json_schema(), + extraction_type="schema", + instruction="Extract a list of items from the text with 'name' and 'price' fields.", + chunk_token_threshold=1200, + overlap_rate=0.1, + apply_chunking=True, + input_format="html", + extra_args={"temperature": 0.1, "max_tokens": 1000}, + verbose=True +) +``` + +--- + +## 5. Putting It in `CrawlerRunConfig` + +**Important**: In Crawl4AI, all strategy definitions should go inside the `CrawlerRunConfig`, not directly as a param in `arun()`. Here’s a full example: + +```python +import os +import asyncio +import json +from pydantic import BaseModel, Field +from typing import List +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode +from crawl4ai.extraction_strategy import LLMExtractionStrategy + +class Product(BaseModel): + name: str + price: str + +async def main(): + # 1. Define the LLM extraction strategy + llm_strategy = LLMExtractionStrategy( + provider="openai/gpt-4o-mini", # e.g. "ollama/llama2" + api_token=os.getenv('OPENAI_API_KEY'), + schema=Product.schema_json(), # Or use model_json_schema() + extraction_type="schema", + instruction="Extract all product objects with 'name' and 'price' from the content.", + chunk_token_threshold=1000, + overlap_rate=0.0, + apply_chunking=True, + input_format="markdown", # or "html", "fit_markdown" + extra_args={"temperature": 0.0, "max_tokens": 800} + ) + + # 2. Build the crawler config + crawl_config = CrawlerRunConfig( + extraction_strategy=llm_strategy, + cache_mode=CacheMode.BYPASS + ) + + # 3. Create a browser config if needed + browser_cfg = BrowserConfig(headless=True) + + async with AsyncWebCrawler(config=browser_cfg) as crawler: + # 4. Let's say we want to crawl a single page + result = await crawler.arun( + url="https://example.com/products", + config=crawl_config + ) + + if result.success: + # 5. The extracted content is presumably JSON + data = json.loads(result.extracted_content) + print("Extracted items:", data) + + # 6. Show usage stats + llm_strategy.show_usage() # prints token usage + else: + print("Error:", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +--- + +## 6. 
Chunking Details + +### 6.1 `chunk_token_threshold` + +If your page is large, you might exceed your LLM’s context window. **`chunk_token_threshold`** sets the approximate max tokens per chunk. The library calculates word→token ratio using `word_token_rate` (often ~0.75 by default). If chunking is enabled (`apply_chunking=True`), the text is split into segments. + +### 6.2 `overlap_rate` + +To keep context continuous across chunks, we can overlap them. E.g., `overlap_rate=0.1` means each subsequent chunk includes 10% of the previous chunk’s text. This is helpful if your needed info might straddle chunk boundaries. + +### 6.3 Performance & Parallelism + +By chunking, you can potentially process multiple chunks in parallel (depending on your concurrency settings and the LLM provider). This reduces total time if the site is huge or has many sections. + +--- + +## 7. Input Format + +By default, **LLMExtractionStrategy** uses `input_format="markdown"`, meaning the **crawler’s final markdown** is fed to the LLM. You can change to: + +- **`html`**: The cleaned HTML or raw HTML (depending on your crawler config) goes into the LLM. +- **`fit_markdown`**: If you used, for instance, `PruningContentFilter`, the “fit” version of the markdown is used. This can drastically reduce tokens if you trust the filter. +- **`markdown`**: Standard markdown output from the crawler’s `markdown_generator`. + +This setting is crucial: if the LLM instructions rely on HTML tags, pick `"html"`. If you prefer a text-based approach, pick `"markdown"`. + +```python +LLMExtractionStrategy( + # ... + input_format="html", # Instead of "markdown" or "fit_markdown" +) +``` + +--- + +## 8. Token Usage & Show Usage + +To keep track of tokens and cost, each chunk is processed with an LLM call. We record usage in: + +- **`usages`** (list): token usage per chunk or call. +- **`total_usage`**: sum of all chunk calls. +- **`show_usage()`**: prints a usage report (if the provider returns usage data). + +```python +llm_strategy = LLMExtractionStrategy(...) +# ... +llm_strategy.show_usage() +# e.g. “Total usage: 1241 tokens across 2 chunk calls” +``` + +If your model provider doesn’t return usage info, these fields might be partial or empty. + +--- + +## 9. Example: Building a Knowledge Graph + +Below is a snippet combining **`LLMExtractionStrategy`** with a Pydantic schema for a knowledge graph. Notice how we pass an **`instruction`** telling the model what to parse. + +```python +import os +import json +import asyncio +from typing import List +from pydantic import BaseModel, Field +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode +from crawl4ai.extraction_strategy import LLMExtractionStrategy + +class Entity(BaseModel): + name: str + description: str + +class Relationship(BaseModel): + entity1: Entity + entity2: Entity + description: str + relation_type: str + +class KnowledgeGraph(BaseModel): + entities: List[Entity] + relationships: List[Relationship] + +async def main(): + # LLM extraction strategy + llm_strat = LLMExtractionStrategy( + provider="openai/gpt-4", + api_token=os.getenv('OPENAI_API_KEY'), + schema=KnowledgeGraph.schema_json(), + extraction_type="schema", + instruction="Extract entities and relationships from the content. 
Return valid JSON.", + chunk_token_threshold=1400, + apply_chunking=True, + input_format="html", + extra_args={"temperature": 0.1, "max_tokens": 1500} + ) + + crawl_config = CrawlerRunConfig( + extraction_strategy=llm_strat, + cache_mode=CacheMode.BYPASS + ) + + async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler: + # Example page + url = "https://www.nbcnews.com/business" + result = await crawler.arun(url=url, config=crawl_config) + + if result.success: + with open("kb_result.json", "w", encoding="utf-8") as f: + f.write(result.extracted_content) + llm_strat.show_usage() + else: + print("Crawl failed:", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Key Observations**: + +- **`extraction_type="schema"`** ensures we get JSON fitting our `KnowledgeGraph`. +- **`input_format="html"`** means we feed HTML to the model. +- **`instruction`** guides the model to output a structured knowledge graph. + +--- + +## 10. Best Practices & Caveats + +1. **Cost & Latency**: LLM calls can be slow or expensive. Consider chunking or smaller coverage if you only need partial data. +2. **Model Token Limits**: If your page + instruction exceed the context window, chunking is essential. +3. **Instruction Engineering**: Well-crafted instructions can drastically improve output reliability. +4. **Schema Strictness**: `"schema"` extraction tries to parse the model output as JSON. If the model returns invalid JSON, partial extraction might happen, or you might get an error. +5. **Parallel vs. Serial**: The library can process multiple chunks in parallel, but you must watch out for rate limits on certain providers. +6. **Check Output**: Sometimes, an LLM might omit fields or produce extraneous text. You may want to post-validate with Pydantic or do additional cleanup. + +--- + +## 11. Conclusion + +**LLM-based extraction** in Crawl4AI is **provider-agnostic**, letting you choose from hundreds of models via LightLLM. It’s perfect for **semantically complex** tasks or generating advanced structures like knowledge graphs. However, it’s **slower** and potentially costlier than schema-based approaches. Keep these tips in mind: + +- Put your LLM strategy **in `CrawlerRunConfig`**. +- Use **`input_format`** to pick which form (markdown, HTML, fit_markdown) the LLM sees. +- Tweak **`chunk_token_threshold`**, **`overlap_rate`**, and **`apply_chunking`** to handle large content efficiently. +- Monitor token usage with `show_usage()`. + +If your site’s data is consistent or repetitive, consider [`JsonCssExtractionStrategy`](./json-extraction-basic.md) first for speed and simplicity. But if you need an **AI-driven** approach, `LLMExtractionStrategy` offers a flexible, multi-provider solution for extracting structured JSON from any website. + +**Next Steps**: + +1. **Experiment with Different Providers** + - Try switching the `provider` (e.g., `"ollama/llama2"`, `"openai/gpt-4o"`, etc.) to see differences in speed, accuracy, or cost. + - Pass different `extra_args` like `temperature`, `top_p`, and `max_tokens` to fine-tune your results. + +2. **Combine With Other Strategies** + - Use [content filters](../../how-to/content-filters.md) like BM25 or Pruning prior to LLM extraction to remove noise and reduce token usage. + - Apply a [CSS or XPath extraction strategy](./json-extraction-basic.md) first for obvious, structured data, then send only the tricky parts to the LLM. + +3. 
**Performance Tuning** + - If pages are large, tweak `chunk_token_threshold`, `overlap_rate`, or `apply_chunking` to optimize throughput. + - Check the usage logs with `show_usage()` to keep an eye on token consumption and identify potential bottlenecks. + +4. **Validate Outputs** + - If using `extraction_type="schema"`, parse the LLM’s JSON with a Pydantic model for a final validation step. + - Log or handle any parse errors gracefully, especially if the model occasionally returns malformed JSON. + +5. **Explore Hooks & Automation** + - Integrate LLM extraction with [hooks](./hooks-custom.md) for complex pre/post-processing. + - Use a multi-step pipeline: crawl, filter, LLM-extract, then store or index results for further analysis. + +6. **Scale and Deploy** + - Combine your LLM extraction setup with [Docker or other deployment solutions](./docker-quickstart.md) to run at scale. + - Monitor memory usage and concurrency if you call LLMs frequently. + +**Last Updated**: 2024-XX-XX + +--- + +That’s it for **Extracting JSON (LLM)**—now you can harness AI to parse, classify, or reorganize data on the web. Happy crawling! \ No newline at end of file diff --git a/docs/md_v3/tutorials/link-media-analysis.md b/docs/md_v3/tutorials/link-media-analysis.md new file mode 100644 index 0000000000000000000000000000000000000000..229fad8d8ade2ad2148cb75c3c9892abce3d4aaf --- /dev/null +++ b/docs/md_v3/tutorials/link-media-analysis.md @@ -0,0 +1,295 @@ +Below is a **draft** of the **“Link & Media Analysis”** tutorial. It demonstrates how to access and filter links, handle domain restrictions, and manage media (especially images) using Crawl4AI’s configuration options. Feel free to adjust examples and text to match your exact workflow or preferences. + +--- + +# Link & Media Analysis + +In this tutorial, you’ll learn how to: + +1. Extract links (internal, external) from crawled pages +2. Filter or exclude specific domains (e.g., social media or custom domains) +3. Access and manage media data (especially images) in the crawl result +4. Configure your crawler to exclude or prioritize certain images + +> **Prerequisites** +> - You have completed or are familiar with the [AsyncWebCrawler Basics](./async-webcrawler-basics.md) tutorial. +> - You can run Crawl4AI in your environment (Playwright, Python, etc.). + +--- + +Below is a revised version of the **Link Extraction** and **Media Extraction** sections that includes example data structures showing how links and media items are stored in `CrawlResult`. Feel free to adjust any field names or descriptions to match your actual output. + +--- + +## 1. Link Extraction + +### 1.1 `result.links` + +When you call `arun()` or `arun_many()` on a URL, Crawl4AI automatically extracts links and stores them in the `links` field of `CrawlResult`. By default, the crawler tries to distinguish **internal** links (same domain) from **external** links (different domains). + +**Basic Example**: + +```python +from crawl4ai import AsyncWebCrawler + +async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://www.example.com") + if result.success: + internal_links = result.links.get("internal", []) + external_links = result.links.get("external", []) + print(f"Found {len(internal_links)} internal links, {len(external_links)} external links.") + + # Each link is typically a dictionary with fields like: + # { "href": "...", "text": "...", "title": "...", "base_domain": "..." 
} + if internal_links: + print("Sample Internal Link:", internal_links[0]) + else: + print("Crawl failed:", result.error_message) +``` + +**Structure Example**: + +```python +result.links = { + "internal": [ + { + "href": "https://kidocode.com/", + "text": "", + "title": "", + "base_domain": "kidocode.com" + }, + { + "href": "https://kidocode.com/degrees/technology", + "text": "Technology Degree", + "title": "KidoCode Tech Program", + "base_domain": "kidocode.com" + }, + # ... + ], + "external": [ + # possibly other links leading to third-party sites + ] +} +``` + +- **`href`**: The raw hyperlink URL. +- **`text`**: The link text (if any) within the `` tag. +- **`title`**: The `title` attribute of the link (if present). +- **`base_domain`**: The domain extracted from `href`. Helpful for filtering or grouping by domain. + +--- + +## 2. Domain Filtering + +Some websites contain hundreds of third-party or affiliate links. You can filter out certain domains at **crawl time** by configuring the crawler. The most relevant parameters in `CrawlerRunConfig` are: + +- **`exclude_external_links`**: If `True`, discard any link pointing outside the root domain. +- **`exclude_social_media_domains`**: Provide a list of social media platforms (e.g., `["facebook.com", "twitter.com"]`) to exclude from your crawl. +- **`exclude_social_media_links`**: If `True`, automatically skip known social platforms. +- **`exclude_domains`**: Provide a list of custom domains you want to exclude (e.g., `["spammyads.com", "tracker.net"]`). + +### 2.1 Example: Excluding External & Social Media Links + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + +async def main(): + crawler_cfg = CrawlerRunConfig( + exclude_external_links=True, # No links outside primary domain + exclude_social_media_links=True # Skip recognized social media domains + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + "https://www.example.com", + config=crawler_cfg + ) + if result.success: + print("[OK] Crawled:", result.url) + print("Internal links count:", len(result.links.get("internal", []))) + print("External links count:", len(result.links.get("external", []))) + # Likely zero external links in this scenario + else: + print("[ERROR]", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +### 2.2 Example: Excluding Specific Domains + +If you want to let external links in, but specifically exclude a domain (e.g., `suspiciousads.com`), do this: + +```python +crawler_cfg = CrawlerRunConfig( + exclude_domains=["suspiciousads.com"] +) +``` + +This approach is handy when you still want external links but need to block certain sites you consider spammy. + +--- + +## 3. Media Extraction + +### 3.1 Accessing `result.media` + +By default, Crawl4AI collects images, audio, and video URLs it finds on the page. These are stored in `result.media`, a dictionary keyed by media type (e.g., `images`, `videos`, `audio`). 
+ +**Basic Example**: + +```python +if result.success: + images_info = result.media.get("images", []) + print(f"Found {len(images_info)} images in total.") + for i, img in enumerate(images_info[:5]): # Inspect just the first 5 + print(f"[Image {i}] URL: {img['src']}") + print(f" Alt text: {img.get('alt', '')}") + print(f" Score: {img.get('score')}") + print(f" Description: {img.get('desc', '')}\n") +``` + +**Structure Example**: + +```python +result.media = { + "images": [ + { + "src": "https://cdn.prod.website-files.com/.../Group%2089.svg", + "alt": "coding school for kids", + "desc": "Trial Class Degrees degrees All Degrees AI Degree Technology ...", + "score": 3, + "type": "image", + "group_id": 0, + "format": None, + "width": None, + "height": None + }, + # ... + ], + "videos": [ + # Similar structure but with video-specific fields + ], + "audio": [ + # Similar structure but with audio-specific fields + ] +} +``` + +Depending on your Crawl4AI version or scraping strategy, these dictionaries can include fields like: + +- **`src`**: The media URL (e.g., image source) +- **`alt`**: The alt text for images (if present) +- **`desc`**: A snippet of nearby text or a short description (optional) +- **`score`**: A heuristic relevance score if you’re using content-scoring features +- **`width`**, **`height`**: If the crawler detects dimensions for the image/video +- **`type`**: Usually `"image"`, `"video"`, or `"audio"` +- **`group_id`**: If you’re grouping related media items, the crawler might assign an ID + +With these details, you can easily filter out or focus on certain images (for instance, ignoring images with very low scores or a different domain), or gather metadata for analytics. + +### 3.2 Excluding External Images + +If you’re dealing with heavy pages or want to skip third-party images (advertisements, for example), you can turn on: + +```python +crawler_cfg = CrawlerRunConfig( + exclude_external_images=True +) +``` + +This setting attempts to discard images from outside the primary domain, keeping only those from the site you’re crawling. + +### 3.3 Additional Media Config + +- **`screenshot`**: Set to `True` if you want a full-page screenshot stored as `base64` in `result.screenshot`. +- **`pdf`**: Set to `True` if you want a PDF version of the page in `result.pdf`. +- **`wait_for_images`**: If `True`, attempts to wait until images are fully loaded before final extraction. + +--- + +## 4. Putting It All Together: Link & Media Filtering + +Here’s a combined example demonstrating how to filter out external links, skip certain domains, and exclude external images: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + +async def main(): + # Suppose we want to keep only internal links, remove certain domains, + # and discard external images from the final crawl data. + crawler_cfg = CrawlerRunConfig( + exclude_external_links=True, + exclude_domains=["spammyads.com"], + exclude_social_media_links=True, # skip Twitter, Facebook, etc. + exclude_external_images=True, # keep only images from main domain + wait_for_images=True, # ensure images are loaded + verbose=True + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://www.example.com", config=crawler_cfg) + + if result.success: + print("[OK] Crawled:", result.url) + + # 1. 
Links + in_links = result.links.get("internal", []) + ext_links = result.links.get("external", []) + print("Internal link count:", len(in_links)) + print("External link count:", len(ext_links)) # should be zero with exclude_external_links=True + + # 2. Images + images = result.media.get("images", []) + print("Images found:", len(images)) + + # Let's see a snippet of these images + for i, img in enumerate(images[:3]): + print(f" - {img['src']} (alt={img.get('alt','')}, score={img.get('score','N/A')})") + else: + print("[ERROR] Failed to crawl. Reason:", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +--- + +## 5. Common Pitfalls & Tips + +1. **Conflicting Flags**: + - `exclude_external_links=True` but then also specifying `exclude_social_media_links=True` is typically fine, but understand that the first setting already discards *all* external links. The second becomes somewhat redundant. + - `exclude_external_images=True` but want to keep some external images? Currently no partial domain-based setting for images, so you might need a custom approach or hook logic. + +2. **Relevancy Scores**: + - If your version of Crawl4AI or your scraping strategy includes an `img["score"]`, it’s typically a heuristic based on size, position, or content analysis. Evaluate carefully if you rely on it. + +3. **Performance**: + - Excluding certain domains or external images can speed up your crawl, especially for large, media-heavy pages. + - If you want a “full” link map, do *not* exclude them. Instead, you can post-filter in your own code. + +4. **Social Media Lists**: + - `exclude_social_media_links=True` typically references an internal list of known social domains like Facebook, Twitter, LinkedIn, etc. If you need to add or remove from that list, look for library settings or a local config file (depending on your version). + +--- + +## 6. Next Steps + +Now that you understand how to manage **Link & Media Analysis**, you can: + +- Fine-tune which links are stored or discarded in your final results +- Control which images (or other media) appear in `result.media` +- Filter out entire domains or social media platforms to keep your dataset relevant + +**Recommended Follow-Ups**: +- **[Advanced Features (Proxy, PDF, Screenshots)](./advanced-features.md)**: If you want to capture screenshots or save the page as a PDF for archival or debugging. +- **[Hooks & Custom Code](./hooks-custom.md)**: For more specialized logic, such as automated “infinite scroll” or repeated “Load More” button clicks. +- **Reference**: Check out [CrawlerRunConfig Reference](../../reference/configuration.md) for a comprehensive parameter list. + +**Last updated**: 2024-XX-XX + +--- + +**That’s it for Link & Media Analysis!** You’re now equipped to filter out unwanted sites and zero in on the images and videos that matter for your project. \ No newline at end of file diff --git a/docs/md_v3/tutorials/markdown-basics.md b/docs/md_v3/tutorials/markdown-basics.md new file mode 100644 index 0000000000000000000000000000000000000000..48498709e38a2edeb345f9e4da5818d94e18eebf --- /dev/null +++ b/docs/md_v3/tutorials/markdown-basics.md @@ -0,0 +1,382 @@ +Below is a **draft** of the **Markdown Generation Basics** tutorial that incorporates your current Crawl4AI design and terminology. It introduces the default markdown generator, explains the concept of content filters (BM25 and Pruning), and covers the `MarkdownGenerationResult` object in a coherent, step-by-step manner. 
Adjust parameters or naming as needed to align with your actual codebase. + +--- + +# Markdown Generation Basics + +One of Crawl4AI’s core features is generating **clean, structured markdown** from web pages. Originally built to solve the problem of extracting only the “actual” content and discarding boilerplate or noise, Crawl4AI’s markdown system remains one of its biggest draws for AI workflows. + +In this tutorial, you’ll learn: + +1. How to configure the **Default Markdown Generator** +2. How **content filters** (BM25 or Pruning) help you refine markdown and discard junk +3. The difference between raw markdown (`result.markdown`) and filtered markdown (`fit_markdown`) + +> **Prerequisites** +> - You’ve completed or read [AsyncWebCrawler Basics](./async-webcrawler-basics.md) to understand how to run a simple crawl. +> - You know how to configure `CrawlerRunConfig`. + +--- + +## 1. Quick Example + +Here’s a minimal code snippet that uses the **DefaultMarkdownGenerator** with no additional filtering: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator + +async def main(): + config = CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator() + ) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com", config=config) + + if result.success: + print("Raw Markdown Output:\n") + print(result.markdown) # The unfiltered markdown from the page + else: + print("Crawl failed:", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**What’s happening?** +- `CrawlerRunConfig(markdown_generator=DefaultMarkdownGenerator())` instructs Crawl4AI to convert the final HTML into markdown at the end of each crawl. +- The resulting markdown is accessible via `result.markdown`. + +--- + +## 2. How Markdown Generation Works + +### 2.1 HTML-to-Text Conversion (Forked & Modified) + +Under the hood, **DefaultMarkdownGenerator** uses a specialized HTML-to-text approach that: + +- Preserves headings, code blocks, bullet points, etc. +- Removes extraneous tags (scripts, styles) that don’t add meaningful content. +- Can optionally generate references for links or skip them altogether. + +A set of **options** (passed as a dict) allows you to customize precisely how HTML converts to markdown. These map to standard html2text-like configuration plus your own enhancements (e.g., ignoring internal links, preserving certain tags verbatim, or adjusting line widths). + +### 2.2 Link Citations & References + +By default, the generator can convert `` elements into `[text][1]` citations, then place the actual links at the bottom of the document. This is handy for research workflows that demand references in a structured manner. + +### 2.3 Optional Content Filters + +Before or after the HTML-to-Markdown step, you can apply a **content filter** (like BM25 or Pruning) to reduce noise and produce a “fit_markdown”—a heavily pruned version focusing on the page’s main text. We’ll cover these filters shortly. + +--- + +## 3. Configuring the Default Markdown Generator + +You can tweak the output by passing an `options` dict to `DefaultMarkdownGenerator`. 
For example: + +```python +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig + +async def main(): + # Example: ignore all links, don't escape HTML, and wrap text at 80 characters + md_generator = DefaultMarkdownGenerator( + options={ + "ignore_links": True, + "escape_html": False, + "body_width": 80 + } + ) + + config = CrawlerRunConfig( + markdown_generator=md_generator + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com/docs", config=config) + if result.success: + print("Markdown:\n", result.markdown[:500]) # Just a snippet + else: + print("Crawl failed:", result.error_message) + +if __name__ == "__main__": + import asyncio + asyncio.run(main()) +``` + +Some commonly used `options`: + +- **`ignore_links`** (bool): Whether to remove all hyperlinks in the final markdown. +- **`ignore_images`** (bool): Remove all `![image]()` references. +- **`escape_html`** (bool): Turn HTML entities into text (default is often `True`). +- **`body_width`** (int): Wrap text at N characters. `0` or `None` means no wrapping. +- **`skip_internal_links`** (bool): If `True`, omit `#localAnchors` or internal links referencing the same page. +- **`include_sup_sub`** (bool): Attempt to handle `` / `` in a more readable way. + +--- + +## 4. Content Filters + +**Content filters** selectively remove or rank sections of text before turning them into Markdown. This is especially helpful if your page has ads, nav bars, or other clutter you don’t want. + +### 4.1 BM25ContentFilter + +If you have a **search query**, BM25 is a good choice: + +```python +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator +from crawl4ai.content_filter_strategy import BM25ContentFilter +from crawl4ai import CrawlerRunConfig + +bm25_filter = BM25ContentFilter( + user_query="machine learning", + bm25_threshold=1.2, + use_stemming=True +) + +md_generator = DefaultMarkdownGenerator( + content_filter=bm25_filter, + options={"ignore_links": True} +) + +config = CrawlerRunConfig(markdown_generator=md_generator) +``` + +- **`user_query`**: The term you want to focus on. BM25 tries to keep only content blocks relevant to that query. +- **`bm25_threshold`**: Raise it to keep fewer blocks; lower it to keep more. +- **`use_stemming`**: If `True`, variations of words match (e.g., “learn,” “learning,” “learnt”). + +**No query provided?** BM25 tries to glean a context from page metadata, or you can simply treat it as a scorched-earth approach that discards text with low generic score. Realistically, you want to supply a query for best results. + +### 4.2 PruningContentFilter + +If you **don’t** have a specific query, or if you just want a robust “junk remover,” use `PruningContentFilter`. It analyzes text density, link density, HTML structure, and known patterns (like “nav,” “footer”) to systematically prune extraneous or repetitive sections. + +```python +from crawl4ai.content_filter_strategy import PruningContentFilter + +prune_filter = PruningContentFilter( + threshold=0.5, + threshold_type="fixed", # or "dynamic" + min_word_threshold=50 +) +``` + +- **`threshold`**: Score boundary. Blocks below this score get removed. +- **`threshold_type`**: + - `"fixed"`: Straight comparison (`score >= threshold` keeps the block). + - `"dynamic"`: The filter adjusts threshold in a data-driven manner. +- **`min_word_threshold`**: Discard blocks under N words as likely too short or unhelpful. 
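
To wire this into a crawl, attach the pruning filter to `DefaultMarkdownGenerator` the same way the BM25 example above does — a minimal sketch (the threshold values here are illustrative, not recommendations):

```python
from crawl4ai import CrawlerRunConfig
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import PruningContentFilter

# Build the filter, then hand it to the markdown generator via content_filter,
# exactly as with BM25ContentFilter above. Values are illustrative.
prune_filter = PruningContentFilter(
    threshold=0.5,
    threshold_type="fixed",   # or "dynamic"
    min_word_threshold=50
)

md_generator = DefaultMarkdownGenerator(
    content_filter=prune_filter,
    options={"ignore_links": True}
)

config = CrawlerRunConfig(markdown_generator=md_generator)
```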
+ +**When to Use PruningContentFilter** +- You want a broad cleanup without a user query. +- The page has lots of repeated sidebars, footers, or disclaimers that hamper text extraction. + +--- + +## 5. Using Fit Markdown + +When a content filter is active, the library produces two forms of markdown inside `result.markdown_v2` or (if using the simplified field) `result.markdown`: + +1. **`raw_markdown`**: The full unfiltered markdown. +2. **`fit_markdown`**: A “fit” version where the filter has removed or trimmed noisy segments. + +**Note**: +- In earlier examples, you may see references to `result.markdown_v2`. Depending on your library version, you might access `result.markdown`, `result.markdown_v2`, or an object named `MarkdownGenerationResult`. The idea is the same: you’ll have a raw version and a filtered (“fit”) version if a filter is used. + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator +from crawl4ai.content_filter_strategy import PruningContentFilter + +async def main(): + config = CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter(threshold=0.6), + options={"ignore_links": True} + ) + ) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://news.example.com/tech", config=config) + if result.success: + print("Raw markdown:\n", result.markdown) + + # If a filter is used, we also have .fit_markdown: + md_object = result.markdown_v2 # or your equivalent + print("Filtered markdown:\n", md_object.fit_markdown) + else: + print("Crawl failed:", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +--- + +## 6. The `MarkdownGenerationResult` Object + +If your library stores detailed markdown output in an object like `MarkdownGenerationResult`, you’ll see fields such as: + +- **`raw_markdown`**: The direct HTML-to-markdown transformation (no filtering). +- **`markdown_with_citations`**: A version that moves links to reference-style footnotes. +- **`references_markdown`**: A separate string or section containing the gathered references. +- **`fit_markdown`**: The filtered markdown if you used a content filter. +- **`fit_html`**: The corresponding HTML snippet used to generate `fit_markdown` (helpful for debugging or advanced usage). + +**Example**: + +```python +md_obj = result.markdown_v2 # your library’s naming may vary +print("RAW:\n", md_obj.raw_markdown) +print("CITED:\n", md_obj.markdown_with_citations) +print("REFERENCES:\n", md_obj.references_markdown) +print("FIT:\n", md_obj.fit_markdown) +``` + +**Why Does This Matter?** +- You can supply `raw_markdown` to an LLM if you want the entire text. +- Or feed `fit_markdown` into a vector database to reduce token usage. +- `references_markdown` can help you keep track of link provenance. + +--- + +Below is a **revised section** under “Combining Filters (BM25 + Pruning)” that demonstrates how you can run **two** passes of content filtering without re-crawling, by taking the HTML (or text) from a first pass and feeding it into the second filter. It uses real code patterns from the snippet you provided for **BM25ContentFilter**, which directly accepts **HTML** strings (and can also handle plain text with minimal adaptation). + +--- + +## 7. 
Combining Filters (BM25 + Pruning) in Two Passes + +You might want to **prune out** noisy boilerplate first (with `PruningContentFilter`), and then **rank what’s left** against a user query (with `BM25ContentFilter`). You don’t have to crawl the page twice. Instead: + +1. **First pass**: Apply `PruningContentFilter` directly to the raw HTML from `result.html` (the crawler’s downloaded HTML). +2. **Second pass**: Take the pruned HTML (or text) from step 1, and feed it into `BM25ContentFilter`, focusing on a user query. + +### Two-Pass Example + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.content_filter_strategy import PruningContentFilter, BM25ContentFilter +from bs4 import BeautifulSoup + +async def main(): + # 1. Crawl with minimal or no markdown generator, just get raw HTML + config = CrawlerRunConfig( + # If you only want raw HTML, you can skip passing a markdown_generator + # or provide one but focus on .html in this example + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com/tech-article", config=config) + + if not result.success or not result.html: + print("Crawl failed or no HTML content.") + return + + raw_html = result.html + + # 2. First pass: PruningContentFilter on raw HTML + pruning_filter = PruningContentFilter(threshold=0.5, min_word_threshold=50) + + # filter_content returns a list of "text chunks" or cleaned HTML sections + pruned_chunks = pruning_filter.filter_content(raw_html) + # This list is basically pruned content blocks, presumably in HTML or text form + + # For demonstration, let's combine these chunks back into a single HTML-like string + # or you could do further processing. It's up to your pipeline design. + pruned_html = "\n".join(pruned_chunks) + + # 3. Second pass: BM25ContentFilter with a user query + bm25_filter = BM25ContentFilter( + user_query="machine learning", + bm25_threshold=1.2, + language="english" + ) + + bm25_chunks = bm25_filter.filter_content(pruned_html) # returns a list of text chunks + + if not bm25_chunks: + print("Nothing matched the BM25 query after pruning.") + return + + # 4. Combine or display final results + final_text = "\n---\n".join(bm25_chunks) + + print("==== PRUNED OUTPUT (first pass) ====") + print(pruned_html[:500], "... (truncated)") # preview + + print("\n==== BM25 OUTPUT (second pass) ====") + print(final_text[:500], "... (truncated)") + +if __name__ == "__main__": + asyncio.run(main()) +``` + +### What’s Happening? + +1. **Raw HTML**: We crawl once and store the raw HTML in `result.html`. +2. **PruningContentFilter**: Takes HTML + optional parameters. It extracts blocks of text or partial HTML, removing headings/sections deemed “noise.” It returns a **list of text chunks**. +3. **Combine or Transform**: We join these pruned chunks back into a single HTML-like string. (Alternatively, you could store them in a list for further logic—whatever suits your pipeline.) +4. **BM25ContentFilter**: We feed the pruned string into `BM25ContentFilter` with a user query. This second pass further narrows the content to chunks relevant to “machine learning.” + +**No Re-Crawling**: We used `raw_html` from the first pass, so there’s no need to run `arun()` again—**no second network request**. + +### Tips & Variations + +- **Plain Text vs. HTML**: If your pruned output is mostly text, BM25 can still handle it; just keep in mind it expects a valid string input. If you supply partial HTML (like `"
<div>some text</div>
    "`), it will parse it as HTML. +- **Chaining in a Single Pipeline**: If your code supports it, you can chain multiple filters automatically. Otherwise, manual two-pass filtering (as shown) is straightforward. +- **Adjust Thresholds**: If you see too much or too little text in step one, tweak `threshold=0.5` or `min_word_threshold=50`. Similarly, `bm25_threshold=1.2` can be raised/lowered for more or fewer chunks in step two. + +### One-Pass Combination? + +If your codebase or pipeline design allows applying multiple filters in one pass, you could do so. But often it’s simpler—and more transparent—to run them sequentially, analyzing each step’s result. + +**Bottom Line**: By **manually chaining** your filtering logic in two passes, you get powerful incremental control over the final content. First, remove “global” clutter with Pruning, then refine further with BM25-based query relevance—without incurring a second network crawl. + +--- + +## 8. Common Pitfalls & Tips + +1. **No Markdown Output?** + - Make sure the crawler actually retrieved HTML. If the site is heavily JS-based, you may need to enable dynamic rendering or wait for elements. + - Check if your content filter is too aggressive. Lower thresholds or disable the filter to see if content reappears. + +2. **Performance Considerations** + - Very large pages with multiple filters can be slower. Consider `cache_mode` to avoid re-downloading. + - If your final use case is LLM ingestion, consider summarizing further or chunking big texts. + +3. **Take Advantage of `fit_markdown`** + - Great for RAG pipelines, semantic search, or any scenario where extraneous boilerplate is unwanted. + - Still verify the textual quality—some sites have crucial data in footers or sidebars. + +4. **Adjusting `html2text` Options** + - If you see lots of raw HTML slipping into the text, turn on `escape_html`. + - If code blocks look messy, experiment with `mark_code` or `handle_code_in_pre`. + +--- + +## 9. Summary & Next Steps + +In this **Markdown Generation Basics** tutorial, you learned to: + +- Configure the **DefaultMarkdownGenerator** with HTML-to-text options. +- Use **BM25ContentFilter** for query-specific extraction or **PruningContentFilter** for general noise removal. +- Distinguish between raw and filtered markdown (`fit_markdown`). +- Leverage the `MarkdownGenerationResult` object to handle different forms of output (citations, references, etc.). + +**Where to go from here**: + +- **[Extracting JSON (No LLM)](./json-extraction-basic.md)**: If you need structured data instead of markdown, check out the library’s JSON extraction strategies. +- **[Advanced Features](./advanced-features.md)**: Combine markdown generation with proxies, PDF exports, and more. +- **[Explanations → Content Filters vs. Extraction Strategies](../../explanations/extraction-chunking.md)**: Dive deeper into how filters differ from chunking or semantic extraction. + +Now you can produce high-quality Markdown from any website, focusing on exactly the content you need—an essential step for powering AI models, summarization pipelines, or knowledge-base queries. + +**Last Updated**: 2024-XX-XX + +--- + +That’s it for **Markdown Generation Basics**! Enjoy generating clean, noise-free markdown for your LLM workflows, content archives, or research. 
\ No newline at end of file diff --git a/docs/md_v3/tutorials/targeted-crawling.md b/docs/md_v3/tutorials/targeted-crawling.md new file mode 100644 index 0000000000000000000000000000000000000000..f5fe2b77c8dd22281a05e66a47c4b23e19cdbc82 --- /dev/null +++ b/docs/md_v3/tutorials/targeted-crawling.md @@ -0,0 +1,227 @@ +Below is a **draft** of a follow-up tutorial, **“Smart Crawling Techniques,”** building on the **“AsyncWebCrawler Basics”** tutorial. This tutorial focuses on three main points: + +1. **Advanced usage of CSS selectors** (e.g., partial extraction, exclusions) +2. **Handling iframes** (if relevant for your workflow) +3. **Waiting for dynamic content** using `wait_for`, including the new `css:` and `js:` prefixes + +Feel free to adjust code snippets, wording, or emphasis to match your library updates or user feedback. + +--- + +# Smart Crawling Techniques + +In the previous tutorial ([AsyncWebCrawler Basics](./async-webcrawler-basics.md)), you learned how to create an `AsyncWebCrawler` instance, run a basic crawl, and inspect the `CrawlResult`. Now it’s time to explore some of the **targeted crawling** features that let you: + +1. Select specific parts of a webpage using CSS selectors +2. Exclude or ignore certain page elements +3. Wait for dynamic content to load using `wait_for` (with `css:` or `js:` rules) +4. (Optionally) Handle iframes if your target site embeds additional content + +> **Prerequisites** +> - You’ve read or completed [AsyncWebCrawler Basics](./async-webcrawler-basics.md). +> - You have a working environment for Crawl4AI (Playwright installed, etc.). + +--- + +## 1. Targeting Specific Elements with CSS Selectors + +### 1.1 Simple CSS Selector Usage + +Let’s say you only need to crawl the main article content of a news page. By setting `css_selector` in `CrawlerRunConfig`, your final HTML or Markdown output focuses on that region. For example: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + +async def main(): + browser_cfg = BrowserConfig(headless=True) + crawler_cfg = CrawlerRunConfig( + css_selector=".article-body", # Only capture .article-body content + excluded_tags=["nav", "footer"] # Optional: skip big nav & footer sections + ) + + async with AsyncWebCrawler(config=browser_cfg) as crawler: + result = await crawler.arun( + url="https://news.example.com/story/12345", + config=crawler_cfg + ) + if result.success: + print("[OK] Extracted content length:", len(result.html)) + else: + print("[ERROR]", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Key Parameters**: +- **`css_selector`**: Tells the crawler to focus on `.article-body`. +- **`excluded_tags`**: Tells the crawler to skip specific HTML tags altogether (e.g., `nav` or `footer`). + +**Tip**: For extremely noisy pages, you can further refine how you exclude certain elements by using `excluded_selector`, which takes a CSS selector you want removed from the final output. + +### 1.2 Excluding Content with `excluded_selector` + +If you want to remove certain sections within `.article-body` (like “related stories” sidebars), set: + +```python +CrawlerRunConfig( + css_selector=".article-body", + excluded_selector=".related-stories, .ads-banner" +) +``` + +This combination grabs the main article content while filtering out sidebars or ads. + +--- + +## 2. Handling Iframes + +Some sites embed extra content via `