Complete Parameter Guide for arun()

The following parameters can be passed to the arun() method. They are organized by their primary usage context and functionality.
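
Every snippet below assumes an active AsyncWebCrawler instance named crawler. As a minimal, self-contained sketch of how these fragments fit together (the URL is illustrative):

import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url="https://example.com")
        if result.success:
            print(result.markdown[:500])  # First 500 characters of the extracted markdown

asyncio.run(main())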

Core Parameters

await crawler.arun(
    url="https://example.com",   # Required: URL to crawl
    verbose=True,               # Enable detailed logging
    cache_mode=CacheMode.ENABLED,  # Control cache behavior
    warmup=True                # Whether to run warmup check
)

Cache Control

from crawl4ai import CacheMode

await crawler.arun(
    cache_mode=CacheMode.ENABLED,    # Normal caching (read/write)
    # Other cache modes:
    # cache_mode=CacheMode.DISABLED   # No caching at all
    # cache_mode=CacheMode.READ_ONLY  # Only read from cache
    # cache_mode=CacheMode.WRITE_ONLY # Only write to cache
    # cache_mode=CacheMode.BYPASS     # Skip cache for this operation
)
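
As a usage sketch, a first call with CacheMode.ENABLED populates the cache, and a later call can force a fresh fetch by switching to BYPASS:

# Normal call: reads from and writes to the cache
cached = await crawler.arun(url="https://example.com", cache_mode=CacheMode.ENABLED)

# Later call: skip the cache entirely to guarantee fresh content
fresh = await crawler.arun(url="https://example.com", cache_mode=CacheMode.BYPASS)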

Content Processing Parameters

Text Processing

await crawler.arun(
    word_count_threshold=10,                 # Minimum words per content block
    image_description_min_word_threshold=5,  # Minimum words for image descriptions
    only_text=False,                         # If True, extract only text content
    excluded_tags=['form', 'nav'],           # HTML tags to exclude
    keep_data_attributes=False,              # If True, preserve data-* attributes
)

Content Selection

await crawler.arun(
    css_selector=".main-content",  # CSS selector for content extraction
    remove_forms=True,             # Remove all form elements
    remove_overlay_elements=True,  # Remove popups/modals/overlays
)

Link Handling

await crawler.arun(
    exclude_external_links=True,          # Remove external links
    exclude_social_media_links=True,      # Remove social media links
    exclude_external_images=True,         # Remove external images
    exclude_domains=["ads.example.com"],  # Specific domains to exclude
    social_media_domains=[               # Additional social media domains
        "facebook.com",
        "twitter.com",
        "instagram.com"
    ]
)
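
The effect of these filters can be checked on the returned result, which groups discovered links into internal and external lists (a sketch, assuming the standard CrawlResult layout):

result = await crawler.arun(
    url="https://example.com",
    exclude_external_links=True,
    exclude_social_media_links=True
)
print(len(result.links.get("internal", [])))  # Links that survived filtering
print(len(result.links.get("external", [])))  # Empty when external links are excluded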

Browser Control Parameters

Basic Browser Settings

await crawler.arun(
    headless=True,                # Run browser in headless mode
    browser_type="chromium",      # Browser engine: "chromium", "firefox", "webkit"
    page_timeout=60000,          # Page load timeout in milliseconds
    user_agent="custom-agent",    # Custom user agent
)

Navigation and Waiting

await crawler.arun(
    wait_for="css:.dynamic-content",  # Wait for element/condition
    delay_before_return_html=2.0,     # Wait before returning HTML (seconds)
)
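
wait_for also accepts a js: prefix for arbitrary page-side predicates. A sketch that waits until a list has rendered at least ten items (the selector is illustrative):

await crawler.arun(
    url="https://example.com/feed",
    # Poll this predicate in the page until it returns true
    wait_for="js:() => document.querySelectorAll('.item').length >= 10"
)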

JavaScript Execution

await crawler.arun(
    js_code=[                     # JavaScript to execute (string or list)
        "window.scrollTo(0, document.body.scrollHeight);",
        "document.querySelector('.load-more').click();"
    ],
    js_only=False,               # If True, run JS in the existing page without reloading (use with session_id)
)

Anti-Bot Features

await crawler.arun(
    magic=True,              # Enable all anti-detection features
    simulate_user=True,      # Simulate human behavior
    override_navigator=True  # Override navigator properties
)

Session Management

await crawler.arun(
    session_id="my_session",  # Session identifier for persistent browsing
)
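
Sessions pay off across multiple arun() calls: the first call navigates, and follow-up calls can run JavaScript in the same live page with js_only=True. A sketch with illustrative selectors:

# Initial navigation; the page stays alive under the session id
await crawler.arun(url="https://example.com/list", session_id="my_session")

# Follow-up: click "next page" in the same tab without a fresh navigation
result = await crawler.arun(
    url="https://example.com/list",
    session_id="my_session",
    js_code="document.querySelector('.next-page').click();",
    js_only=True,
    wait_for="css:.page-2"
)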

Screenshot Options

await crawler.arun(
    screenshot=True,              # Take page screenshot
    screenshot_wait_for=2.0,      # Wait before screenshot (seconds)
)
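
The screenshot is returned base64-encoded on the result, so saving it to disk takes a couple of lines (a sketch, assuming result.screenshot carries the base64 payload):

import base64

result = await crawler.arun(url="https://example.com", screenshot=True)
if result.screenshot:
    with open("page.png", "wb") as f:
        f.write(base64.b64decode(result.screenshot))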

Proxy Configuration

await crawler.arun(
    proxy="http://proxy.example.com:8080",  # Simple proxy URL
    # For authenticated proxies, pass proxy_config instead of proxy:
    # proxy_config={
    #     "server": "http://proxy.example.com:8080",
    #     "username": "user",
    #     "password": "pass"
    # }
)

Content Extraction Parameters

Extraction Strategy

from crawl4ai.extraction_strategy import LLMExtractionStrategy

await crawler.arun(
    extraction_strategy=LLMExtractionStrategy(
        provider="ollama/llama2",            # LLM provider
        schema=MySchema.schema(),            # Pydantic model schema
        instruction="Extract specific data"  # Natural-language instruction
    )
)
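
End to end, the schema is typically a Pydantic model and the structured output arrives in result.extracted_content as a JSON string. A hedged sketch (the model, extraction_type value, and instruction are illustrative):

import json
from pydantic import BaseModel
from crawl4ai.extraction_strategy import LLMExtractionStrategy

class Product(BaseModel):
    name: str
    price: str

result = await crawler.arun(
    url="https://example.com/products",
    extraction_strategy=LLMExtractionStrategy(
        provider="ollama/llama2",
        schema=Product.schema(),
        extraction_type="schema",
        instruction="Extract each product's name and price"
    )
)
items = json.loads(result.extracted_content)  # List of extracted records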

Chunking Strategy

from crawl4ai.chunking_strategy import RegexChunking

await crawler.arun(
    chunking_strategy=RegexChunking(
        patterns=[r'\n\n', r'\.\s+']  # Split on blank lines and sentence boundaries
    )
)

HTML to Text Options

await crawler.arun(
    html2text={
        "ignore_links": False,
        "ignore_images": False,
        "escape_dot": False,
        "body_width": 0,
        "protect_links": True,
        "unicode_snob": True
    }
)

Debug Options

await crawler.arun(
    log_console=True,   # Log browser console messages
)

Parameter Interactions and Notes

  1. Cache and Performance Setup

    # Optimal caching for repeated crawls
    await crawler.arun(
        cache_mode=CacheMode.ENABLED,
        word_count_threshold=10,
        process_iframes=False
    )
    
  2. Dynamic Content Handling

    # Handle lazy-loaded content
    await crawler.arun(
        js_code="window.scrollTo(0, document.body.scrollHeight);",
        wait_for="css:.lazy-content",
        delay_before_return_html=2.0,
        cache_mode=CacheMode.WRITE_ONLY  # Cache results after dynamic load
    )
    
  3. Content Extraction Pipeline

    # Complete extraction setup
    await crawler.arun(
        css_selector=".main-content",
        word_count_threshold=20,
        extraction_strategy=my_strategy,
        chunking_strategy=my_chunking,
        process_iframes=True,
        remove_overlay_elements=True,
        cache_mode=CacheMode.ENABLED
    )
    

Best Practices

  1. Performance Optimization

    await crawler.arun(
        cache_mode=CacheMode.ENABLED,  # Use full caching
        word_count_threshold=10,      # Filter out noise
        process_iframes=False         # Skip iframes if not needed
    )
    
  2. Reliable Scraping

    await crawler.arun(
        magic=True,                   # Enable anti-detection
        delay_before_return_html=1.0, # Wait for dynamic content
        page_timeout=60000,          # Longer timeout for slow pages
        cache_mode=CacheMode.WRITE_ONLY  # Cache results after successful crawl
    )
    
  3. Clean Content

    await crawler.arun(
        remove_overlay_elements=True,  # Remove popups
        excluded_tags=['nav', 'aside'],  # Remove unnecessary elements
        keep_data_attributes=False,    # Remove data attributes
        cache_mode=CacheMode.ENABLED   # Use cache for faster processing
    )