Complete Parameter Guide for arun()

The following parameters can be passed to the arun() method. They are organized by their primary usage context and functionality.
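
Every snippet below assumes an active AsyncWebCrawler instance named crawler. As a minimal, self-contained sketch of how these fragments fit together (the URL is illustrative):

import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url="https://example.com")
        if result.success:
            print(result.markdown[:500])  # First 500 characters of the extracted markdown

asyncio.run(main())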

Core Parameters

await crawler.arun(
    url="https://example.com",   # Required: URL to crawl
    verbose=True,               # Enable detailed logging
    cache_mode=CacheMode.ENABLED,  # Control cache behavior
    warmup=True                # Whether to run warmup check
)

Cache Control

from crawl4ai import CacheMode

await crawler.arun(
    cache_mode=CacheMode.ENABLED,    # Normal caching (read/write)
    # Other cache modes:
    # cache_mode=CacheMode.DISABLED   # No caching at all
    # cache_mode=CacheMode.READ_ONLY  # Only read from cache
    # cache_mode=CacheMode.WRITE_ONLY # Only write to cache
    # cache_mode=CacheMode.BYPASS     # Skip cache for this operation
)
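
As a usage sketch, a first call with CacheMode.ENABLED populates the cache, and a later call can force a fresh fetch by switching to BYPASS:

# Normal call: reads from and writes to the cache
cached = await crawler.arun(url="https://example.com", cache_mode=CacheMode.ENABLED)

# Later call: skip the cache entirely to guarantee fresh content
fresh = await crawler.arun(url="https://example.com", cache_mode=CacheMode.BYPASS)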

Content Processing Parameters

Text Processing

await crawler.arun(
    word_count_threshold=10,                 # Minimum words per content block
    image_description_min_word_threshold=5,  # Minimum words for image descriptions
    only_text=False,                         # If True, extract only text content
    excluded_tags=['form', 'nav'],           # HTML tags to exclude
    keep_data_attributes=False,              # If True, preserve data-* attributes
)

Content Selection

await crawler.arun(
    css_selector=".main-content",  # CSS selector for content extraction
    remove_forms=True,             # Remove all form elements
    remove_overlay_elements=True,  # Remove popups/modals/overlays
)

Link Handling

await crawler.arun(
    exclude_external_links=True,          # Remove external links
    exclude_social_media_links=True,      # Remove social media links
    exclude_external_images=True,         # Remove external images
    exclude_domains=["ads.example.com"],  # Specific domains to exclude
    social_media_domains=[               # Additional social media domains
        "facebook.com",
        "twitter.com",
        "instagram.com"
    ]
)
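
The effect of these filters can be checked on the returned result, which groups discovered links into internal and external lists (a sketch, assuming the standard CrawlResult layout):

result = await crawler.arun(
    url="https://example.com",
    exclude_external_links=True,
    exclude_social_media_links=True
)
print(len(result.links.get("internal", [])))  # Links that survived filtering
print(len(result.links.get("external", [])))  # Empty when external links are excluded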

Browser Control Parameters

Basic Browser Settings

await crawler.arun(
    headless=True,                # Run browser in headless mode
    browser_type="chromium",      # Browser engine: "chromium", "firefox", "webkit"
    page_timeout=60000,          # Page load timeout in milliseconds
    user_agent="custom-agent",    # Custom user agent
)

Navigation and Waiting

await crawler.arun(
    wait_for="css:.dynamic-content",  # Wait for element/condition
    delay_before_return_html=2.0,     # Wait before returning HTML (seconds)
)
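
wait_for also accepts a js: prefix for arbitrary page-side predicates. A sketch that waits until a list has rendered at least ten items (the selector is illustrative):

await crawler.arun(
    url="https://example.com/feed",
    # Poll this predicate in the page until it returns true
    wait_for="js:() => document.querySelectorAll('.item').length >= 10"
)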

JavaScript Execution

await crawler.arun(
    js_code=[                     # JavaScript to execute (string or list)
        "window.scrollTo(0, document.body.scrollHeight);",
        "document.querySelector('.load-more').click();"
    ],
    js_only=False,               # If True, run JS in the existing page without reloading (use with session_id)
)

Anti-Bot Features

await crawler.arun(
    magic=True,              # Enable all anti-detection features
    simulate_user=True,      # Simulate human behavior
    override_navigator=True  # Override navigator properties
)

Session Management

await crawler.arun(
    session_id="my_session",  # Session identifier for persistent browsing
)
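
Sessions pay off across multiple arun() calls: the first call navigates, and follow-up calls can run JavaScript in the same live page with js_only=True. A sketch with illustrative selectors:

# Initial navigation; the page stays alive under the session id
await crawler.arun(url="https://example.com/list", session_id="my_session")

# Follow-up: click "next page" in the same tab without a fresh navigation
result = await crawler.arun(
    url="https://example.com/list",
    session_id="my_session",
    js_code="document.querySelector('.next-page').click();",
    js_only=True,
    wait_for="css:.page-2"
)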

Screenshot Options

await crawler.arun(
    screenshot=True,              # Take page screenshot
    screenshot_wait_for=2.0,      # Wait before screenshot (seconds)
)
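
The screenshot is returned base64-encoded on the result, so saving it to disk takes a couple of lines (a sketch, assuming result.screenshot carries the base64 payload):

import base64

result = await crawler.arun(url="https://example.com", screenshot=True)
if result.screenshot:
    with open("page.png", "wb") as f:
        f.write(base64.b64decode(result.screenshot))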

Proxy Configuration

await crawler.arun(
    proxy="http://proxy.example.com:8080",  # Simple proxy URL
    # For authenticated proxies, pass proxy_config instead of proxy:
    # proxy_config={
    #     "server": "http://proxy.example.com:8080",
    #     "username": "user",
    #     "password": "pass"
    # }
)

Content Extraction Parameters

Extraction Strategy

from crawl4ai.extraction_strategy import LLMExtractionStrategy

await crawler.arun(
    extraction_strategy=LLMExtractionStrategy(
        provider="ollama/llama2",            # LLM provider
        schema=MySchema.schema(),            # Pydantic model schema
        instruction="Extract specific data"  # Natural-language instruction
    )
)
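
End to end, the schema is typically a Pydantic model and the structured output arrives in result.extracted_content as a JSON string. A hedged sketch (the model, extraction_type value, and instruction are illustrative):

import json
from pydantic import BaseModel
from crawl4ai.extraction_strategy import LLMExtractionStrategy

class Product(BaseModel):
    name: str
    price: str

result = await crawler.arun(
    url="https://example.com/products",
    extraction_strategy=LLMExtractionStrategy(
        provider="ollama/llama2",
        schema=Product.schema(),
        extraction_type="schema",
        instruction="Extract each product's name and price"
    )
)
items = json.loads(result.extracted_content)  # List of extracted records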

Chunking Strategy

from crawl4ai.chunking_strategy import RegexChunking

await crawler.arun(
    chunking_strategy=RegexChunking(
        patterns=[r'\n\n', r'\.\s+']  # Split on blank lines and sentence boundaries
    )
)

HTML to Text Options

await crawler.arun(
    html2text={
        "ignore_links": False,
        "ignore_images": False,
        "escape_dot": False,
        "body_width": 0,
        "protect_links": True,
        "unicode_snob": True
    }
)

Debug Options

await crawler.arun(
    log_console=True,   # Log browser console messages
)

Parameter Interactions and Notes

  1. Cache and Performance Setup

    # Optimal caching for repeated crawls
    await crawler.arun(
        cache_mode=CacheMode.ENABLED,
        word_count_threshold=10,
        process_iframes=False
    )
    
  2. Dynamic Content Handling

    # Handle lazy-loaded content
    await crawler.arun(
        js_code="window.scrollTo(0, document.body.scrollHeight);",
        wait_for="css:.lazy-content",
        delay_before_return_html=2.0,
        cache_mode=CacheMode.WRITE_ONLY  # Cache results after dynamic load
    )
    
  3. Content Extraction Pipeline

    # Complete extraction setup
    await crawler.arun(
        css_selector=".main-content",
        word_count_threshold=20,
        extraction_strategy=my_strategy,
        chunking_strategy=my_chunking,
        process_iframes=True,
        remove_overlay_elements=True,
        cache_mode=CacheMode.ENABLED
    )
    

Best Practices

  1. Performance Optimization

    await crawler.arun(
        cache_mode=CacheMode.ENABLED,  # Use full caching
        word_count_threshold=10,      # Filter out noise
        process_iframes=False         # Skip iframes if not needed
    )
    
  2. Reliable Scraping

    await crawler.arun(
        magic=True,                   # Enable anti-detection
        delay_before_return_html=1.0, # Wait for dynamic content
        page_timeout=60000,          # Longer timeout for slow pages
        cache_mode=CacheMode.WRITE_ONLY  # Cache results after successful crawl
    )
    
  3. Clean Content

    await crawler.arun(
        remove_overlay_elements=True,  # Remove popups
        excluded_tags=['nav', 'aside'],  # Remove unnecessary elements
        keep_data_attributes=False,    # Remove data attributes
        cache_mode=CacheMode.ENABLED   # Use cache for faster processing
    )