+ # TODO: line up <ol><li>s > 9 correctly.
+ parent_list = None
+ for list in self.list:
+ self.o(
+ " " if parent_list == "ol" and list.name == "ul" else " "
+ )
+ parent_list = list.name
+
+ if li.name == "ul":
+ self.o(self.ul_item_mark + " ")
+ elif li.name == "ol":
+ li.num += 1
+ self.o(str(li.num) + ". ")
+ self.start = True
+
+ if tag in ["table", "tr", "td", "th"]:
+ if self.ignore_tables:
+ if tag == "tr":
+ if start:
+ pass
+ else:
+ self.soft_br()
+ else:
+ pass
+
+ elif self.bypass_tables:
+ if start:
+ self.soft_br()
+ if tag in ["td", "th"]:
+ if start:
+ self.o("<{}>\n\n".format(tag))
+ else:
+ self.o("\n{}>".format(tag))
+ else:
+ if start:
+ self.o("<{}>".format(tag))
+ else:
+ self.o("{}>".format(tag))
+
+ else:
+ if tag == "table":
+ if start:
+ self.table_start = True
+ if self.pad_tables:
+ self.o("<" + config.TABLE_MARKER_FOR_PAD + ">")
+ self.o(" \n")
+ else:
+ if self.pad_tables:
+ # add a break in case the table is empty or is a one-row table
+ self.soft_br()
+ self.o("" + config.TABLE_MARKER_FOR_PAD + ">")
+ self.o(" \n")
+ if tag in ["td", "th"] and start:
+ if self.split_next_td:
+ self.o("| ")
+ self.split_next_td = True
+
+ if tag == "tr" and start:
+ self.td_count = 0
+ if tag == "tr" and not start:
+ self.split_next_td = False
+ self.soft_br()
+ if tag == "tr" and not start and self.table_start:
+ # Underline table header
+ self.o("|".join(["---"] * self.td_count))
+ self.soft_br()
+ self.table_start = False
+ if tag in ["td", "th"] and start:
+ self.td_count += 1
+
+ if tag == "pre":
+ if start:
+ self.startpre = True
+ self.pre = True
+ else:
+ self.pre = False
+ if self.mark_code:
+ self.out("\n[/code]")
+ self.p()
+
+ if tag in ["sup", "sub"] and self.include_sup_sub:
+ if start:
+ self.o("<{}>".format(tag))
+ else:
+ self.o("{}>".format(tag))
+
+ # TODO: Add docstring for these one letter functions
+ def pbr(self) -> None:
+ "Pretty print has a line break"
+ if self.p_p == 0:
+ self.p_p = 1
+
+ def p(self) -> None:
+ "Set pretty print to 1 or 2 lines"
+ self.p_p = 1 if self.single_line_break else 2
+
+ def soft_br(self) -> None:
+ "Soft breaks"
+ self.pbr()
+ self.br_toggle = " "
+
+ def o(
+ self, data: str, puredata: bool = False, force: Union[bool, str] = False
+ ) -> None:
+ """
+ Deal with indentation and whitespace
+ """
+ if self.abbr_data is not None:
+ self.abbr_data += data
+
+ if not self.quiet:
+ if self.google_doc:
+ # prevent white space immediately after 'begin emphasis'
+ # marks ('**' and '_')
+ lstripped_data = data.lstrip()
+ if self.drop_white_space and not (self.pre or self.code):
+ data = lstripped_data
+ if lstripped_data != "":
+ self.drop_white_space = 0
+
+ if puredata and not self.pre:
+ # This is a very dangerous call ... it could mess up
+ # all handling of &nbsp; when not handled properly
+ # (see entityref)
+ data = re.sub(r"\s+", r" ", data)
+ if data and data[0] == " ":
+ self.space = True
+ data = data[1:]
+ if not data and not force:
+ return
+
+ if self.startpre:
+ # self.out(" :") #TODO: not output when already one there
+ if not data.startswith("\n") and not data.startswith("\r\n"):
+ # <pre>stuff...
+ data = "\n" + data
+ if self.mark_code:
+ self.out("\n[code]")
+ self.p_p = 0
+
+ bq = ">" * self.blockquote
+ if not (force and data and data[0] == ">") and self.blockquote:
+ bq += " "
+
+ if self.pre:
+ if not self.list:
+ bq += " "
+ # else: list content is already partially indented
+ bq += " " * len(self.list)
+ data = data.replace("\n", "\n" + bq)
+
+ if self.startpre:
+ self.startpre = False
+ if self.list:
+ # use existing initial indentation
+ data = data.lstrip("\n")
+
+ if self.start:
+ self.space = False
+ self.p_p = 0
+ self.start = False
+
+ if force == "end":
+ # It's the end.
+ self.p_p = 0
+ self.out("\n")
+ self.space = False
+
+ if self.p_p:
+ self.out((self.br_toggle + "\n" + bq) * self.p_p)
+ self.space = False
+ self.br_toggle = ""
+
+ if self.space:
+ if not self.lastWasNL:
+ self.out(" ")
+ self.space = False
+
+ if self.a and (
+ (self.p_p == 2 and self.links_each_paragraph) or force == "end"
+ ):
+ if force == "end":
+ self.out("\n")
+
+ newa = []
+ for link in self.a:
+ if self.outcount > link.outcount:
+ self.out(
+ " ["
+ + str(link.count)
+ + "]: "
+ + urlparse.urljoin(self.baseurl, link.attrs["href"])
+ )
+ if "title" in link.attrs and link.attrs["title"] is not None:
+ self.out(" (" + link.attrs["title"] + ")")
+ self.out("\n")
+ else:
+ newa.append(link)
+
+ # Don't need an extra line when nothing was done.
+ if self.a != newa:
+ self.out("\n")
+
+ self.a = newa
+
+ if self.abbr_list and force == "end":
+ for abbr, definition in self.abbr_list.items():
+ self.out(" *[" + abbr + "]: " + definition + "\n")
+
+ self.p_p = 0
+ self.out(data)
+ self.outcount += 1
+
+ def handle_data(self, data: str, entity_char: bool = False) -> None:
+ if not data:
+ # Data may be empty for some HTML entities. For example,
+ # LEFT-TO-RIGHT MARK.
+ return
+
+ if self.stressed:
+ data = data.strip()
+ self.stressed = False
+ self.preceding_stressed = True
+ elif self.preceding_stressed:
+ if (
+ re.match(r"[^][(){}\s.!?]", data[0])
+ and not hn(self.current_tag)
+ and self.current_tag not in ["a", "code", "pre"]
+ ):
+ # should match a letter or common punctuation
+ data = " " + data
+ self.preceding_stressed = False
+
+ if self.style:
+ self.style_def.update(dumb_css_parser(data))
+
+ if self.maybe_automatic_link is not None:
+ href = self.maybe_automatic_link
+ if (
+ href == data
+ and self.absolute_url_matcher.match(href)
+ and self.use_automatic_links
+ ):
+ self.o("<" + data + ">")
+ self.empty_link = False
+ return
+ else:
+ self.o("[")
+ self.maybe_automatic_link = None
+ self.empty_link = False
+
+ if not self.code and not self.pre and not entity_char:
+ data = escape_md_section(
+ data,
+ snob=self.escape_snob,
+ escape_dot=self.escape_dot,
+ escape_plus=self.escape_plus,
+ escape_dash=self.escape_dash,
+ )
+ self.preceding_data = data
+ self.o(data, puredata=True)
+
+ def charref(self, name: str) -> str:
+ if name[0] in ["x", "X"]:
+ c = int(name[1:], 16)
+ else:
+ c = int(name)
+
+ if not self.unicode_snob and c in unifiable_n:
+ return unifiable_n[c]
+ else:
+ try:
+ return chr(c)
+ except ValueError: # invalid unicode
+ return ""
+
+ def entityref(self, c: str) -> str:
+ if not self.unicode_snob and c in config.UNIFIABLE:
+ return config.UNIFIABLE[c]
+ try:
+ ch = html.entities.html5[c + ";"]
+ except KeyError:
+ return "&" + c + ";"
+ return config.UNIFIABLE[c] if c == "nbsp" else ch
+
+ def google_nest_count(self, style: Dict[str, str]) -> int:
+ """
+ Calculate the nesting count of google doc lists
+
+ :type style: dict
+
+ :rtype: int
+ """
+ nest_count = 0
+ if "margin-left" in style:
+ nest_count = int(style["margin-left"][:-2]) // self.google_list_indent
+
+ return nest_count
+
+ def optwrap(self, text: str) -> str:
+ """
+ Wrap all paragraphs in the provided text.
+
+ :type text: str
+
+ :rtype: str
+ """
+ if not self.body_width:
+ return text
+
+ result = ""
+ newlines = 0
+ # I cannot think of a better solution for now.
+ # To avoid the non-wrap behaviour for entire paras
+ # because of the presence of a link in it
+ if not self.wrap_links:
+ self.inline_links = False
+ for para in text.split("\n"):
+ if len(para) > 0:
+ if not skipwrap(
+ para, self.wrap_links, self.wrap_list_items, self.wrap_tables
+ ):
+ indent = ""
+ if para.startswith(" " + self.ul_item_mark):
+ # list item continuation: add a double indent to the
+ # new lines
+ indent = " "
+ elif para.startswith("> "):
+ # blockquote continuation: add the greater than symbol
+ # to the new lines
+ indent = "> "
+ wrapped = wrap(
+ para,
+ self.body_width,
+ break_long_words=False,
+ subsequent_indent=indent,
+ )
+ result += "\n".join(wrapped)
+ if para.endswith(" "):
+ result += " \n"
+ newlines = 1
+ elif indent:
+ result += "\n"
+ newlines = 1
+ else:
+ result += "\n\n"
+ newlines = 2
+ else:
+ # Warning for the tempted!!!
+ # Be aware that obvious replacement of this with
+ # line.isspace()
+ # DOES NOT work! Explanations are welcome.
+ if not config.RE_SPACE.match(para):
+ result += para + "\n"
+ newlines = 1
+ else:
+ if newlines < 2:
+ result += "\n"
+ newlines += 1
+ return result
+
+def html2text(html: str, baseurl: str = "", bodywidth: Optional[int] = None) -> str:
+ if bodywidth is None:
+ bodywidth = config.BODY_WIDTH
+ h = HTML2Text(baseurl=baseurl, bodywidth=bodywidth)
+
+ return h.handle(html)
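# Editor's sketch (not part of the patch): minimal use of the helper above. The
# import path `crawl4ai.html2text` is assumed from the file layout in this diff.
from crawl4ai.html2text import html2text

md = html2text("<h1>Title</h1><p>Some <b>bold</b> text.</p>", bodywidth=0)
print(md)  # expected to print roughly "# Title" followed by "Some **bold** text."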
+
+class CustomHTML2Text(HTML2Text):
+ def __init__(self, *args, handle_code_in_pre=False, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.inside_pre = False
+ self.inside_code = False
+ self.preserve_tags = set() # Set of tags to preserve
+ self.current_preserved_tag = None
+ self.preserved_content = []
+ self.preserve_depth = 0
+ self.handle_code_in_pre = handle_code_in_pre
+
+ # Configuration options
+ self.skip_internal_links = False
+ self.single_line_break = False
+ self.mark_code = False
+ self.include_sup_sub = False
+ self.body_width = 0
+ self.ignore_mailto_links = True
+ self.ignore_links = False
+ self.escape_backslash = False
+ self.escape_dot = False
+ self.escape_plus = False
+ self.escape_dash = False
+ self.escape_snob = False
+
+ def update_params(self, **kwargs):
+ """Update parameters and set preserved tags."""
+ for key, value in kwargs.items():
+ if key == 'preserve_tags':
+ self.preserve_tags = set(value)
+ elif key == 'handle_code_in_pre':
+ self.handle_code_in_pre = value
+ else:
+ setattr(self, key, value)
+
+ def handle_tag(self, tag, attrs, start):
+ # Handle preserved tags
+ if tag in self.preserve_tags:
+ if start:
+ if self.preserve_depth == 0:
+ self.current_preserved_tag = tag
+ self.preserved_content = []
+ # Format opening tag with attributes
+ attr_str = ''.join(f' {k}="{v}"' for k, v in attrs.items() if v is not None)
+ self.preserved_content.append(f'<{tag}{attr_str}>')
+ self.preserve_depth += 1
+ return
+ else:
+ self.preserve_depth -= 1
+ if self.preserve_depth == 0:
+ self.preserved_content.append(f'</{tag}>')
+ # Output the preserved HTML block with proper spacing
+ preserved_html = ''.join(self.preserved_content)
+ self.o('\n' + preserved_html + '\n')
+ self.current_preserved_tag = None
+ return
+
+ # If we're inside a preserved tag, collect all content
+ if self.preserve_depth > 0:
+ if start:
+ # Format nested tags with attributes
+ attr_str = ''.join(f' {k}="{v}"' for k, v in attrs.items() if v is not None)
+ self.preserved_content.append(f'<{tag}{attr_str}>')
+ else:
+ self.preserved_content.append(f'</{tag}>')
+ return
+
+ # Handle pre tags
+ if tag == 'pre':
+ if start:
+ self.o('```\n') # Markdown code block start
+ self.inside_pre = True
+ else:
+ self.o('\n```\n') # Markdown code block end
+ self.inside_pre = False
+ elif tag == 'code':
+ if self.inside_pre and not self.handle_code_in_pre:
+ # Ignore code tags inside pre blocks if handle_code_in_pre is False
+ return
+ if start:
+ self.o('`') # Markdown inline code start
+ self.inside_code = True
+ else:
+ self.o('`') # Markdown inline code end
+ self.inside_code = False
+ else:
+ super().handle_tag(tag, attrs, start)
+
+ def handle_data(self, data, entity_char=False):
+ """Override handle_data to capture content within preserved tags."""
+ if self.preserve_depth > 0:
+ self.preserved_content.append(data)
+ return
+
+ if self.inside_pre:
+ # Output the raw content for pre blocks, including content inside code tags
+ self.o(data) # Directly output the data as-is (preserve newlines)
+ return
+ if self.inside_code:
+ # Inline code: no newlines allowed
+ self.o(data.replace('\n', ' '))
+ return
+
+ # Default behavior for other tags
+ super().handle_data(data, entity_char)
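# Editor's sketch (not part of the patch): one way to drive the subclass above.
# `preserve_tags` and `handle_code_in_pre` are the options defined in this class;
# handle() is the inherited html2text entry point.
converter = CustomHTML2Text(handle_code_in_pre=True)
converter.update_params(preserve_tags={"table"})
print(converter.handle(
    "<p>Intro</p>"
    "<pre><code>print('hi')</code></pre>"
    "<table><tr><td>kept as raw HTML</td></tr></table>"
))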
+
diff --git a/crawl4ai/html2text/__main__.py b/crawl4ai/html2text/__main__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e28416e104515e90fca4b69cc60d0c61fd15d61
--- /dev/null
+++ b/crawl4ai/html2text/__main__.py
@@ -0,0 +1,3 @@
+from .cli import main
+
+main()
diff --git a/crawl4ai/html2text/_typing.py b/crawl4ai/html2text/_typing.py
new file mode 100644
index 0000000000000000000000000000000000000000..eed83251cd381e68c0c5062ac3a50b97fbc3a483
--- /dev/null
+++ b/crawl4ai/html2text/_typing.py
@@ -0,0 +1,2 @@
+class OutCallback:
+ def __call__(self, s: str) -> None: ...
diff --git a/crawl4ai/html2text/cli.py b/crawl4ai/html2text/cli.py
new file mode 100644
index 0000000000000000000000000000000000000000..015322743d7bebb535b105d493cd6d23da64f303
--- /dev/null
+++ b/crawl4ai/html2text/cli.py
@@ -0,0 +1,330 @@
+import argparse
+import sys
+
+from . import HTML2Text, __version__, config
+
+
+def main() -> None:
+ baseurl = ""
+
+ class bcolors:
+ HEADER = "\033[95m"
+ OKBLUE = "\033[94m"
+ OKGREEN = "\033[92m"
+ WARNING = "\033[93m"
+ FAIL = "\033[91m"
+ ENDC = "\033[0m"
+ BOLD = "\033[1m"
+ UNDERLINE = "\033[4m"
+
+ p = argparse.ArgumentParser()
+ p.add_argument(
+ "--default-image-alt",
+ dest="default_image_alt",
+ default=config.DEFAULT_IMAGE_ALT,
+ help="The default alt string for images with missing ones",
+ )
+ p.add_argument(
+ "--pad-tables",
+ dest="pad_tables",
+ action="store_true",
+ default=config.PAD_TABLES,
+ help="pad the cells to equal column width in tables",
+ )
+ p.add_argument(
+ "--no-wrap-links",
+ dest="wrap_links",
+ action="store_false",
+ default=config.WRAP_LINKS,
+ help="don't wrap links during conversion",
+ )
+ p.add_argument(
+ "--wrap-list-items",
+ dest="wrap_list_items",
+ action="store_true",
+ default=config.WRAP_LIST_ITEMS,
+ help="wrap list items during conversion",
+ )
+ p.add_argument(
+ "--wrap-tables",
+ dest="wrap_tables",
+ action="store_true",
+ default=config.WRAP_TABLES,
+ help="wrap tables",
+ )
+ p.add_argument(
+ "--ignore-emphasis",
+ dest="ignore_emphasis",
+ action="store_true",
+ default=config.IGNORE_EMPHASIS,
+ help="don't include any formatting for emphasis",
+ )
+ p.add_argument(
+ "--reference-links",
+ dest="inline_links",
+ action="store_false",
+ default=config.INLINE_LINKS,
+ help="use reference style links instead of inline links",
+ )
+ p.add_argument(
+ "--ignore-links",
+ dest="ignore_links",
+ action="store_true",
+ default=config.IGNORE_ANCHORS,
+ help="don't include any formatting for links",
+ )
+ p.add_argument(
+ "--ignore-mailto-links",
+ action="store_true",
+ dest="ignore_mailto_links",
+ default=config.IGNORE_MAILTO_LINKS,
+ help="don't include mailto: links",
+ )
+ p.add_argument(
+ "--protect-links",
+ dest="protect_links",
+ action="store_true",
+ default=config.PROTECT_LINKS,
+ help="protect links from line breaks surrounding them with angle brackets",
+ )
+ p.add_argument(
+ "--ignore-images",
+ dest="ignore_images",
+ action="store_true",
+ default=config.IGNORE_IMAGES,
+ help="don't include any formatting for images",
+ )
+ p.add_argument(
+ "--images-as-html",
+ dest="images_as_html",
+ action="store_true",
+ default=config.IMAGES_AS_HTML,
+ help=(
+ "Always write image tags as raw html; preserves `height`, `width` and "
+ "`alt` if possible."
+ ),
+ )
+ p.add_argument(
+ "--images-to-alt",
+ dest="images_to_alt",
+ action="store_true",
+ default=config.IMAGES_TO_ALT,
+ help="Discard image data, only keep alt text",
+ )
+ p.add_argument(
+ "--images-with-size",
+ dest="images_with_size",
+ action="store_true",
+ default=config.IMAGES_WITH_SIZE,
+ help=(
+ "Write image tags with height and width attrs as raw html to retain "
+ "dimensions"
+ ),
+ )
+ p.add_argument(
+ "-g",
+ "--google-doc",
+ action="store_true",
+ dest="google_doc",
+ default=False,
+ help="convert an html-exported Google Document",
+ )
+ p.add_argument(
+ "-d",
+ "--dash-unordered-list",
+ action="store_true",
+ dest="ul_style_dash",
+ default=False,
+ help="use a dash rather than a star for unordered list items",
+ )
+ p.add_argument(
+ "-e",
+ "--asterisk-emphasis",
+ action="store_true",
+ dest="em_style_asterisk",
+ default=False,
+ help="use an asterisk rather than an underscore for emphasized text",
+ )
+ p.add_argument(
+ "-b",
+ "--body-width",
+ dest="body_width",
+ type=int,
+ default=config.BODY_WIDTH,
+ help="number of characters per output line, 0 for no wrap",
+ )
+ p.add_argument(
+ "-i",
+ "--google-list-indent",
+ dest="list_indent",
+ type=int,
+ default=config.GOOGLE_LIST_INDENT,
+ help="number of pixels Google indents nested lists",
+ )
+ p.add_argument(
+ "-s",
+ "--hide-strikethrough",
+ action="store_true",
+ dest="hide_strikethrough",
+ default=False,
+ help="hide strike-through text. only relevant when -g is " "specified as well",
+ )
+ p.add_argument(
+ "--escape-all",
+ action="store_true",
+ dest="escape_snob",
+ default=False,
+ help=(
+ "Escape all special characters. Output is less readable, but avoids "
+ "corner case formatting issues."
+ ),
+ )
+ p.add_argument(
+ "--bypass-tables",
+ action="store_true",
+ dest="bypass_tables",
+ default=config.BYPASS_TABLES,
+ help="Format tables in HTML rather than Markdown syntax.",
+ )
+ p.add_argument(
+ "--ignore-tables",
+ action="store_true",
+ dest="ignore_tables",
+ default=config.IGNORE_TABLES,
+ help="Ignore table-related tags (table, th, td, tr) " "while keeping rows.",
+ )
+ p.add_argument(
+ "--single-line-break",
+ action="store_true",
+ dest="single_line_break",
+ default=config.SINGLE_LINE_BREAK,
+ help=(
+ "Use a single line break after a block element rather than two line "
+ "breaks. NOTE: Requires --body-width=0"
+ ),
+ )
+ p.add_argument(
+ "--unicode-snob",
+ action="store_true",
+ dest="unicode_snob",
+ default=config.UNICODE_SNOB,
+ help="Use unicode throughout document",
+ )
+ p.add_argument(
+ "--no-automatic-links",
+ action="store_false",
+ dest="use_automatic_links",
+ default=config.USE_AUTOMATIC_LINKS,
+ help="Do not use automatic links wherever applicable",
+ )
+ p.add_argument(
+ "--no-skip-internal-links",
+ action="store_false",
+ dest="skip_internal_links",
+ default=config.SKIP_INTERNAL_LINKS,
+ help="Do not skip internal links",
+ )
+ p.add_argument(
+ "--links-after-para",
+ action="store_true",
+ dest="links_each_paragraph",
+ default=config.LINKS_EACH_PARAGRAPH,
+ help="Put links after each paragraph instead of document",
+ )
+ p.add_argument(
+ "--mark-code",
+ action="store_true",
+ dest="mark_code",
+ default=config.MARK_CODE,
+ help="Mark program code blocks with [code]...[/code]",
+ )
+ p.add_argument(
+ "--decode-errors",
+ dest="decode_errors",
+ default=config.DECODE_ERRORS,
+ help=(
+ "What to do in case of decode errors.'ignore', 'strict' and 'replace' are "
+ "acceptable values"
+ ),
+ )
+ p.add_argument(
+ "--open-quote",
+ dest="open_quote",
+ default=config.OPEN_QUOTE,
+ help="The character used to open quotes",
+ )
+ p.add_argument(
+ "--close-quote",
+ dest="close_quote",
+ default=config.CLOSE_QUOTE,
+ help="The character used to close quotes",
+ )
+ p.add_argument(
+ "--version", action="version", version=".".join(map(str, __version__))
+ )
+ p.add_argument("filename", nargs="?")
+ p.add_argument("encoding", nargs="?", default="utf-8")
+ p.add_argument(
+ "--include-sup-sub",
+ dest="include_sup_sub",
+ action="store_true",
+ default=config.INCLUDE_SUP_SUB,
+ help="Include the sup and sub tags",
+ )
+ args = p.parse_args()
+
+ if args.filename and args.filename != "-":
+ with open(args.filename, "rb") as fp:
+ data = fp.read()
+ else:
+ data = sys.stdin.buffer.read()
+
+ try:
+ html = data.decode(args.encoding, args.decode_errors)
+ except UnicodeDecodeError as err:
+ warning = bcolors.WARNING + "Warning:" + bcolors.ENDC
+ warning += " Use the " + bcolors.OKGREEN
+ warning += "--decode-errors=ignore" + bcolors.ENDC + " flag."
+ print(warning)
+ raise err
+
+ h = HTML2Text(baseurl=baseurl)
+ # handle options
+ if args.ul_style_dash:
+ h.ul_item_mark = "-"
+ if args.em_style_asterisk:
+ h.emphasis_mark = "*"
+ h.strong_mark = "__"
+
+ h.body_width = args.body_width
+ h.google_list_indent = args.list_indent
+ h.ignore_emphasis = args.ignore_emphasis
+ h.ignore_links = args.ignore_links
+ h.ignore_mailto_links = args.ignore_mailto_links
+ h.protect_links = args.protect_links
+ h.ignore_images = args.ignore_images
+ h.images_as_html = args.images_as_html
+ h.images_to_alt = args.images_to_alt
+ h.images_with_size = args.images_with_size
+ h.google_doc = args.google_doc
+ h.hide_strikethrough = args.hide_strikethrough
+ h.escape_snob = args.escape_snob
+ h.bypass_tables = args.bypass_tables
+ h.ignore_tables = args.ignore_tables
+ h.single_line_break = args.single_line_break
+ h.inline_links = args.inline_links
+ h.unicode_snob = args.unicode_snob
+ h.use_automatic_links = args.use_automatic_links
+ h.skip_internal_links = args.skip_internal_links
+ h.links_each_paragraph = args.links_each_paragraph
+ h.mark_code = args.mark_code
+ h.wrap_links = args.wrap_links
+ h.wrap_list_items = args.wrap_list_items
+ h.wrap_tables = args.wrap_tables
+ h.pad_tables = args.pad_tables
+ h.default_image_alt = args.default_image_alt
+ h.open_quote = args.open_quote
+ h.close_quote = args.close_quote
+ h.include_sup_sub = args.include_sup_sub
+
+ sys.stdout.write(h.handle(html))
diff --git a/crawl4ai/html2text/config.py b/crawl4ai/html2text/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..d14ed64f90772ea9a3e92cc850b659f6f31756f0
--- /dev/null
+++ b/crawl4ai/html2text/config.py
@@ -0,0 +1,172 @@
+import re
+
+# Use Unicode characters instead of their ascii pseudo-replacements
+UNICODE_SNOB = False
+
+# Marker to use for marking tables for padding post processing
+TABLE_MARKER_FOR_PAD = "special_marker_for_table_padding"
+# Escape all special characters. Output is less readable, but avoids
+# corner case formatting issues.
+ESCAPE_SNOB = False
+ESCAPE_BACKSLASH = False
+ESCAPE_DOT = False
+ESCAPE_PLUS = False
+ESCAPE_DASH = False
+
+# Put the links after each paragraph instead of at the end.
+LINKS_EACH_PARAGRAPH = False
+
+# Wrap long lines at position. 0 for no wrapping.
+BODY_WIDTH = 78
+
+# Don't show internal links (href="#local-anchor") -- corresponding link
+# targets won't be visible in the plain text file anyway.
+SKIP_INTERNAL_LINKS = True
+
+# Use inline, rather than reference, formatting for images and links
+INLINE_LINKS = True
+
+# Protect links from line breaks surrounding them with angle brackets (in
+# addition to their square brackets)
+PROTECT_LINKS = False
+# WRAP_LINKS = True
+WRAP_LINKS = True
+
+# Wrap list items.
+WRAP_LIST_ITEMS = False
+
+# Wrap tables
+WRAP_TABLES = False
+
+# Number of pixels Google indents nested lists
+GOOGLE_LIST_INDENT = 36
+
+# Values Google and others may use to indicate bold text
+BOLD_TEXT_STYLE_VALUES = ("bold", "700", "800", "900")
+
+IGNORE_ANCHORS = False
+IGNORE_MAILTO_LINKS = False
+IGNORE_IMAGES = False
+IMAGES_AS_HTML = False
+IMAGES_TO_ALT = False
+IMAGES_WITH_SIZE = False
+IGNORE_EMPHASIS = False
+MARK_CODE = False
+DECODE_ERRORS = "strict"
+DEFAULT_IMAGE_ALT = ""
+PAD_TABLES = False
+
+# Convert links with same href and text to <href> format
+# if they are absolute links
+USE_AUTOMATIC_LINKS = True
+
+# For checking space-only lines on line 771
+RE_SPACE = re.compile(r"\s\+")
+
+RE_ORDERED_LIST_MATCHER = re.compile(r"\d+\.\s")
+RE_UNORDERED_LIST_MATCHER = re.compile(r"[-\*\+]\s")
+RE_MD_CHARS_MATCHER = re.compile(r"([\\\[\]\(\)])")
+RE_MD_CHARS_MATCHER_ALL = re.compile(r"([`\*_{}\[\]\(\)#!])")
+
+# to find links in the text
+RE_LINK = re.compile(r"(\[.*?\] ?\(.*?\))|(\[.*?\]:.*?)")
+
+# to find table separators
+RE_TABLE = re.compile(r" \| ")
+
+RE_MD_DOT_MATCHER = re.compile(
+ r"""
+ ^ # start of line
+ (\s*\d+) # optional whitespace and a number
+ (\.) # dot
+ (?=\s) # lookahead assert whitespace
+ """,
+ re.MULTILINE | re.VERBOSE,
+)
+RE_MD_PLUS_MATCHER = re.compile(
+ r"""
+ ^
+ (\s*)
+ (\+)
+ (?=\s)
+ """,
+ flags=re.MULTILINE | re.VERBOSE,
+)
+RE_MD_DASH_MATCHER = re.compile(
+ r"""
+ ^
+ (\s*)
+ (-)
+ (?=\s|\-) # followed by whitespace (bullet list, or spaced out hr)
+ # or another dash (header or hr)
+ """,
+ flags=re.MULTILINE | re.VERBOSE,
+)
+RE_SLASH_CHARS = r"\`*_{}[]()#+-.!"
+RE_MD_BACKSLASH_MATCHER = re.compile(
+ r"""
+ (\\) # match one slash
+ (?=[%s]) # followed by a char that requires escaping
+ """
+ % re.escape(RE_SLASH_CHARS),
+ flags=re.VERBOSE,
+)
+
+UNIFIABLE = {
+ "rsquo": "'",
+ "lsquo": "'",
+ "rdquo": '"',
+ "ldquo": '"',
+ "copy": "(C)",
+ "mdash": "--",
+ "nbsp": " ",
+ "rarr": "->",
+ "larr": "<-",
+ "middot": "*",
+ "ndash": "-",
+ "oelig": "oe",
+ "aelig": "ae",
+ "agrave": "a",
+ "aacute": "a",
+ "acirc": "a",
+ "atilde": "a",
+ "auml": "a",
+ "aring": "a",
+ "egrave": "e",
+ "eacute": "e",
+ "ecirc": "e",
+ "euml": "e",
+ "igrave": "i",
+ "iacute": "i",
+ "icirc": "i",
+ "iuml": "i",
+ "ograve": "o",
+ "oacute": "o",
+ "ocirc": "o",
+ "otilde": "o",
+ "ouml": "o",
+ "ugrave": "u",
+ "uacute": "u",
+ "ucirc": "u",
+ "uuml": "u",
+ "lrm": "",
+ "rlm": "",
+}
+
+# Format tables in HTML rather than Markdown syntax
+BYPASS_TABLES = False
+# Ignore table-related tags (table, th, td, tr) while keeping rows
+IGNORE_TABLES = False
+
+
+# Use a single line break after a block element rather than two line breaks.
+# NOTE: Requires body width setting to be 0.
+SINGLE_LINE_BREAK = False
+
+
+# Use double quotation marks when converting the <q> tag.
+OPEN_QUOTE = '"'
+CLOSE_QUOTE = '"'
+
+# Include the <sup> and <sub> tags
+INCLUDE_SUP_SUB = False
diff --git a/crawl4ai/html2text/elements.py b/crawl4ai/html2text/elements.py
new file mode 100644
index 0000000000000000000000000000000000000000..2533ec084e664f6c4cd19adb175325de0c844d55
--- /dev/null
+++ b/crawl4ai/html2text/elements.py
@@ -0,0 +1,18 @@
+from typing import Dict, Optional
+
+
+class AnchorElement:
+ __slots__ = ["attrs", "count", "outcount"]
+
+ def __init__(self, attrs: Dict[str, Optional[str]], count: int, outcount: int):
+ self.attrs = attrs
+ self.count = count
+ self.outcount = outcount
+
+
+class ListElement:
+ __slots__ = ["name", "num"]
+
+ def __init__(self, name: str, num: int):
+ self.name = name
+ self.num = num
diff --git a/crawl4ai/html2text/utils.py b/crawl4ai/html2text/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..1909d2cf754b57c8d1fba112a1a1eb3af81a8d3b
--- /dev/null
+++ b/crawl4ai/html2text/utils.py
@@ -0,0 +1,303 @@
+import html.entities
+from typing import Dict, List, Optional
+
+from . import config
+
+unifiable_n = {
+ html.entities.name2codepoint[k]: v
+ for k, v in config.UNIFIABLE.items()
+ if k != "nbsp"
+}
+
+
+def hn(tag: str) -> int:
+ if tag[0] == "h" and len(tag) == 2:
+ n = tag[1]
+ if "0" < n <= "9":
+ return int(n)
+ return 0
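# Editor's note (not part of the patch): hn() maps heading tags to their level.
assert hn("h2") == 2      # h1-h9 return the numeric level
assert hn("header") == 0  # anything else returns 0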
+
+
+def dumb_property_dict(style: str) -> Dict[str, str]:
+ """
+ :returns: A hash of css attributes
+ """
+ return {
+ x.strip().lower(): y.strip().lower()
+ for x, y in [z.split(":", 1) for z in style.split(";") if ":" in z]
+ }
+
+
+def dumb_css_parser(data: str) -> Dict[str, Dict[str, str]]:
+ """
+ :type data: str
+
+ :returns: A hash of css selectors, each of which contains a hash of
+ css attributes.
+ :rtype: dict
+ """
+ # remove @import sentences
+ data += ";"
+ importIndex = data.find("@import")
+ while importIndex != -1:
+ data = data[0:importIndex] + data[data.find(";", importIndex) + 1 :]
+ importIndex = data.find("@import")
+
+ # parse the css. reverted from dictionary comprehension in order to
+ # support older pythons
+ pairs = [x.split("{") for x in data.split("}") if "{" in x.strip()]
+ try:
+ elements = {a.strip(): dumb_property_dict(b) for a, b in pairs}
+ except ValueError:
+ elements = {} # not that important
+
+ return elements
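# Editor's note (not part of the patch): what the two CSS helpers above produce.
sample_css = ".note { color: red; font-weight: bold } p { margin: 0 }"
assert dumb_css_parser(sample_css) == {
    ".note": {"color": "red", "font-weight": "bold"},
    "p": {"margin": "0"},
}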
+
+
+def element_style(
+ attrs: Dict[str, Optional[str]],
+ style_def: Dict[str, Dict[str, str]],
+ parent_style: Dict[str, str],
+) -> Dict[str, str]:
+ """
+ :type attrs: dict
+ :type style_def: dict
+ :type style_def: dict
+
+ :returns: A hash of the 'final' style attributes of the element
+ :rtype: dict
+ """
+ style = parent_style.copy()
+ if "class" in attrs:
+ assert attrs["class"] is not None
+ for css_class in attrs["class"].split():
+ css_style = style_def.get("." + css_class, {})
+ style.update(css_style)
+ if "style" in attrs:
+ assert attrs["style"] is not None
+ immediate_style = dumb_property_dict(attrs["style"])
+ style.update(immediate_style)
+
+ return style
+
+
+def google_list_style(style: Dict[str, str]) -> str:
+ """
+ Finds out whether this is an ordered or unordered list
+
+ :type style: dict
+
+ :rtype: str
+ """
+ if "list-style-type" in style:
+ list_style = style["list-style-type"]
+ if list_style in ["disc", "circle", "square", "none"]:
+ return "ul"
+
+ return "ol"
+
+
+def google_has_height(style: Dict[str, str]) -> bool:
+ """
+ Check if the style of the element has the 'height' attribute
+ explicitly defined
+
+ :type style: dict
+
+ :rtype: bool
+ """
+ return "height" in style
+
+
+def google_text_emphasis(style: Dict[str, str]) -> List[str]:
+ """
+ :type style: dict
+
+ :returns: A list of all emphasis modifiers of the element
+ :rtype: list
+ """
+ emphasis = []
+ if "text-decoration" in style:
+ emphasis.append(style["text-decoration"])
+ if "font-style" in style:
+ emphasis.append(style["font-style"])
+ if "font-weight" in style:
+ emphasis.append(style["font-weight"])
+
+ return emphasis
+
+
+def google_fixed_width_font(style: Dict[str, str]) -> bool:
+ """
+ Check if the css of the current element defines a fixed width font
+
+ :type style: dict
+
+ :rtype: bool
+ """
+ font_family = ""
+ if "font-family" in style:
+ font_family = style["font-family"]
+ return "courier new" == font_family or "consolas" == font_family
+
+
+def list_numbering_start(attrs: Dict[str, Optional[str]]) -> int:
+ """
+ Extract numbering from list element attributes
+
+ :type attrs: dict
+
+ :rtype: int or None
+ """
+ if "start" in attrs:
+ assert attrs["start"] is not None
+ try:
+ return int(attrs["start"]) - 1
+ except ValueError:
+ pass
+
+ return 0
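# Editor's note (not part of the patch): list_numbering_start() returns the
# zero-based offset implied by an <ol start="..."> attribute.
assert list_numbering_start({"start": "5"}) == 4  # numbering resumes at 5
assert list_numbering_start({}) == 0              # no start attribute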
+
+
+def skipwrap(
+ para: str, wrap_links: bool, wrap_list_items: bool, wrap_tables: bool
+) -> bool:
+ # If it appears to contain a link
+ # don't wrap
+ if not wrap_links and config.RE_LINK.search(para):
+ return True
+ # If the text begins with four spaces or one tab, it's a code block;
+ # don't wrap
+ if para[0:4] == " " or para[0] == "\t":
+ return True
+
+ # If the text begins with only two "--", possibly preceded by
+ # whitespace, that's an emdash; so wrap.
+ stripped = para.lstrip()
+ if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
+ return False
+
+ # I'm not sure what this is for; I thought it was to detect lists,
+ # but there's a <br> inside <span> case in one of the tests that
+ # also depends upon it.
+ if stripped[0:1] in ("-", "*") and not stripped[0:2] == "**":
+ return not wrap_list_items
+
+ # If text contains a pipe character it is likely a table
+ if not wrap_tables and config.RE_TABLE.search(para):
+ return True
+
+ # If the text begins with a single -, *, or +, followed by a space,
+ # or an integer, followed by a ., followed by a space (in either
+ # case optionally proceeded by whitespace), it's a list; don't wrap.
+ return bool(
+ config.RE_ORDERED_LIST_MATCHER.match(stripped)
+ or config.RE_UNORDERED_LIST_MATCHER.match(stripped)
+ )
+
+
+def escape_md(text: str) -> str:
+ """
+ Escapes markdown-sensitive characters within other markdown
+ constructs.
+ """
+ return config.RE_MD_CHARS_MATCHER.sub(r"\\\1", text)
+
+
+def escape_md_section(
+ text: str,
+ escape_backslash: bool = True,
+ snob: bool = False,
+ escape_dot: bool = True,
+ escape_plus: bool = True,
+ escape_dash: bool = True
+) -> str:
+ """
+ Escapes markdown-sensitive characters across whole document sections.
+ Each escaping operation can be controlled individually.
+ """
+ if escape_backslash:
+ text = config.RE_MD_BACKSLASH_MATCHER.sub(r"\\\1", text)
+
+ if snob:
+ text = config.RE_MD_CHARS_MATCHER_ALL.sub(r"\\\1", text)
+
+ if escape_dot:
+ text = config.RE_MD_DOT_MATCHER.sub(r"\1\\\2", text)
+
+ if escape_plus:
+ text = config.RE_MD_PLUS_MATCHER.sub(r"\1\\\2", text)
+
+ if escape_dash:
+ text = config.RE_MD_DASH_MATCHER.sub(r"\1\\\2", text)
+
+ return text
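# Editor's note (not part of the patch): the escaping helpers above in action.
assert escape_md("see [docs] (maybe)") == r"see \[docs\] \(maybe\)"
assert escape_md_section("1. not a list item") == r"1\. not a list item"
assert escape_md_section("+ not a bullet") == r"\+ not a bullet"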
+
+def reformat_table(lines: List[str], right_margin: int) -> List[str]:
+ """
+ Given the lines of a table
+ pads the cells and returns the new lines
+ """
+ # find the maximum width of the columns
+ max_width = [len(x.rstrip()) + right_margin for x in lines[0].split("|")]
+ max_cols = len(max_width)
+ for line in lines:
+ cols = [x.rstrip() for x in line.split("|")]
+ num_cols = len(cols)
+
+ # don't drop any data if colspan attributes result in unequal lengths
+ if num_cols < max_cols:
+ cols += [""] * (max_cols - num_cols)
+ elif max_cols < num_cols:
+ max_width += [len(x) + right_margin for x in cols[-(num_cols - max_cols) :]]
+ max_cols = num_cols
+
+ max_width = [
+ max(len(x) + right_margin, old_len) for x, old_len in zip(cols, max_width)
+ ]
+
+ # reformat
+ new_lines = []
+ for line in lines:
+ cols = [x.rstrip() for x in line.split("|")]
+ if set(line.strip()) == set("-|"):
+ filler = "-"
+ new_cols = [
+ x.rstrip() + (filler * (M - len(x.rstrip())))
+ for x, M in zip(cols, max_width)
+ ]
+ new_lines.append("|-" + "|".join(new_cols) + "|")
+ else:
+ filler = " "
+ new_cols = [
+ x.rstrip() + (filler * (M - len(x.rstrip())))
+ for x, M in zip(cols, max_width)
+ ]
+ new_lines.append("| " + "|".join(new_cols) + "|")
+ return new_lines
+
+
+def pad_tables_in_text(text: str, right_margin: int = 1) -> str:
+ """
+ Provide padding for tables in the text
+ """
+ lines = text.split("\n")
+ table_buffer = [] # type: List[str]
+ table_started = False
+ new_lines = []
+ for line in lines:
+ # Toggle table started
+ if config.TABLE_MARKER_FOR_PAD in line:
+ table_started = not table_started
+ if not table_started:
+ table = reformat_table(table_buffer, right_margin)
+ new_lines.extend(table)
+ table_buffer = []
+ new_lines.append("")
+ continue
+ # Process lines
+ if table_started:
+ table_buffer.append(line)
+ else:
+ new_lines.append(line)
+ return "\n".join(new_lines)
diff --git a/crawl4ai/install.py b/crawl4ai/install.py
new file mode 100644
index 0000000000000000000000000000000000000000..7efb6800b1d7eb9a9edf5fea639f92c982fbe7b8
--- /dev/null
+++ b/crawl4ai/install.py
@@ -0,0 +1,83 @@
+import subprocess
+import sys
+import asyncio
+from .async_logger import AsyncLogger, LogLevel
+
+# Initialize logger
+logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True)
+
+def post_install():
+ """Run all post-installation tasks"""
+ logger.info("Running post-installation setup...", tag="INIT")
+ install_playwright()
+ run_migration()
+ logger.success("Post-installation setup completed!", tag="COMPLETE")
+
+def install_playwright():
+ logger.info("Installing Playwright browsers...", tag="INIT")
+ try:
+ # subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "--force", "chrome"])
+ subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "--force", "chromium"])
+ logger.success("Playwright installation completed successfully.", tag="COMPLETE")
+ except subprocess.CalledProcessError as e:
+ # logger.error(f"Error during Playwright installation: {e}", tag="ERROR")
+ logger.warning(f"Please run '{sys.executable} -m playwright install --with-deps' manually after the installation.")
+ except Exception as e:
+ # logger.error(f"Unexpected error during Playwright installation: {e}", tag="ERROR")
+ logger.warning(f"Please run '{sys.executable} -m playwright install --with-deps' manually after the installation.")
+
+def run_migration():
+ """Initialize database during installation"""
+ try:
+ logger.info("Starting database initialization...", tag="INIT")
+ from crawl4ai.async_database import async_db_manager
+
+ asyncio.run(async_db_manager.initialize())
+ logger.success("Database initialization completed successfully.", tag="COMPLETE")
+ except ImportError:
+ logger.warning("Database module not found. Will initialize on first use.")
+ except Exception as e:
+ logger.warning(f"Database initialization failed: {e}")
+ logger.warning("Database will be initialized on first use")
+
+async def run_doctor():
+ """Test if Crawl4AI is working properly"""
+ logger.info("Running Crawl4AI health check...", tag="INIT")
+ try:
+ from .async_webcrawler import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+
+ browser_config = BrowserConfig(
+ headless=True,
+ browser_type="chromium",
+ ignore_https_errors=True,
+ light_mode=True,
+ viewport_width=1280,
+ viewport_height=720
+ )
+
+ run_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ screenshot=True,
+ )
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ logger.info("Testing crawling capabilities...", tag="TEST")
+ result = await crawler.arun(
+ url="https://crawl4ai.com",
+ config=run_config
+ )
+
+ if result and result.markdown:
+ logger.success("✅ Crawling test passed!", tag="COMPLETE")
+ return True
+ else:
+ raise Exception("Failed to get content")
+
+ except Exception as e:
+ logger.error(f"❌ Test failed: {e}", tag="ERROR")
+ return False
+
+def doctor():
+ """Entry point for the doctor command"""
+ import asyncio
+ return asyncio.run(run_doctor())
diff --git a/crawl4ai/js_snippet/__init__.py b/crawl4ai/js_snippet/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73b0c2dd343d45483a5783ecc8d05fa459af9cd6
--- /dev/null
+++ b/crawl4ai/js_snippet/__init__.py
@@ -0,0 +1,15 @@
+import os
+
+# Load a JS script by name from the folder containing this module and return its contents as a string.
+def load_js_script(script_name):
+ # Get the path of the current script
+ current_script_path = os.path.dirname(os.path.realpath(__file__))
+ # Get the path of the script to load
+ script_path = os.path.join(current_script_path, script_name + '.js')
+ # Check if the script exists
+ if not os.path.exists(script_path):
+ raise ValueError(f"Script {script_name} not found in the folder {current_script_path}")
+ # Load the content of the script
+ with open(script_path, 'r') as f:
+ script_content = f.read()
+ return script_content
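# Editor's note (not part of the patch): the loader above resolves names against
# this package directory, so the snippets added in this diff load like this.
overrider_js = load_js_script("navigator_overrider")
cleanup_js = load_js_script("remove_overlay_elements")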
diff --git a/crawl4ai/js_snippet/navigator_overrider.js b/crawl4ai/js_snippet/navigator_overrider.js
new file mode 100644
index 0000000000000000000000000000000000000000..f341ceeb743bfaea669a7bdf378844586f52c5f2
--- /dev/null
+++ b/crawl4ai/js_snippet/navigator_overrider.js
@@ -0,0 +1,25 @@
+// Pass the Permissions Test.
+const originalQuery = window.navigator.permissions.query;
+window.navigator.permissions.query = (parameters) =>
+ parameters.name === "notifications"
+ ? Promise.resolve({ state: Notification.permission })
+ : originalQuery(parameters);
+Object.defineProperty(navigator, "webdriver", {
+ get: () => undefined,
+});
+window.navigator.chrome = {
+ runtime: {},
+ // Add other properties if necessary
+};
+Object.defineProperty(navigator, "plugins", {
+ get: () => [1, 2, 3, 4, 5],
+});
+Object.defineProperty(navigator, "languages", {
+ get: () => ["en-US", "en"],
+});
+Object.defineProperty(document, "hidden", {
+ get: () => false,
+});
+Object.defineProperty(document, "visibilityState", {
+ get: () => "visible",
+});
diff --git a/crawl4ai/js_snippet/remove_overlay_elements.js b/crawl4ai/js_snippet/remove_overlay_elements.js
new file mode 100644
index 0000000000000000000000000000000000000000..0400d89c40a9c0206ddf6d8110d0c2939a29af8e
--- /dev/null
+++ b/crawl4ai/js_snippet/remove_overlay_elements.js
@@ -0,0 +1,119 @@
+async () => {
+ // Function to check if element is visible
+ const isVisible = (elem) => {
+ const style = window.getComputedStyle(elem);
+ return style.display !== "none" && style.visibility !== "hidden" && style.opacity !== "0";
+ };
+
+ // Common selectors for popups and overlays
+ const commonSelectors = [
+ // Close buttons first
+ 'button[class*="close" i]',
+ 'button[class*="dismiss" i]',
+ 'button[aria-label*="close" i]',
+ 'button[title*="close" i]',
+ 'a[class*="close" i]',
+ 'span[class*="close" i]',
+
+ // Cookie notices
+ '[class*="cookie-banner" i]',
+ '[id*="cookie-banner" i]',
+ '[class*="cookie-consent" i]',
+ '[id*="cookie-consent" i]',
+
+ // Newsletter/subscription dialogs
+ '[class*="newsletter" i]',
+ '[class*="subscribe" i]',
+
+ // Generic popups/modals
+ '[class*="popup" i]',
+ '[class*="modal" i]',
+ '[class*="overlay" i]',
+ '[class*="dialog" i]',
+ '[role="dialog"]',
+ '[role="alertdialog"]',
+ ];
+
+ // Try to click close buttons first
+ for (const selector of commonSelectors.slice(0, 6)) {
+ const closeButtons = document.querySelectorAll(selector);
+ for (const button of closeButtons) {
+ if (isVisible(button)) {
+ try {
+ button.click();
+ await new Promise((resolve) => setTimeout(resolve, 100));
+ } catch (e) {
+ console.log("Error clicking button:", e);
+ }
+ }
+ }
+ }
+
+ // Remove remaining overlay elements
+ const removeOverlays = () => {
+ // Find elements with high z-index
+ const allElements = document.querySelectorAll("*");
+ for (const elem of allElements) {
+ const style = window.getComputedStyle(elem);
+ const zIndex = parseInt(style.zIndex);
+ const position = style.position;
+
+ if (
+ isVisible(elem) &&
+ (zIndex > 999 || position === "fixed" || position === "absolute") &&
+ (elem.offsetWidth > window.innerWidth * 0.5 ||
+ elem.offsetHeight > window.innerHeight * 0.5 ||
+ style.backgroundColor.includes("rgba") ||
+ parseFloat(style.opacity) < 1)
+ ) {
+ elem.remove();
+ }
+ }
+
+ // Remove elements matching common selectors
+ for (const selector of commonSelectors) {
+ const elements = document.querySelectorAll(selector);
+ elements.forEach((elem) => {
+ if (isVisible(elem)) {
+ elem.remove();
+ }
+ });
+ }
+ };
+
+ // Remove overlay elements
+ removeOverlays();
+
+ // Remove any fixed/sticky position elements at the top/bottom
+ const removeFixedElements = () => {
+ const elements = document.querySelectorAll("*");
+ elements.forEach((elem) => {
+ const style = window.getComputedStyle(elem);
+ if ((style.position === "fixed" || style.position === "sticky") && isVisible(elem)) {
+ elem.remove();
+ }
+ });
+ };
+
+ removeFixedElements();
+
+ // Remove empty block elements such as div, p, span, etc.
+ const removeEmptyBlockElements = () => {
+ const blockElements = document.querySelectorAll(
+ "div, p, span, section, article, header, footer, aside, nav, main, ul, ol, li, dl, dt, dd, h1, h2, h3, h4, h5, h6"
+ );
+ blockElements.forEach((elem) => {
+ if (elem.innerText.trim() === "") {
+ elem.remove();
+ }
+ });
+ };
+
+ // Remove margin-right and padding-right from body (often added by modal scripts)
+ document.body.style.marginRight = "0px";
+ document.body.style.paddingRight = "0px";
+ document.body.style.overflow = "auto";
+
+ // Wait a bit for any animations to complete
+ await new Promise((resolve) => setTimeout(resolve, 100));
+};
diff --git a/crawl4ai/js_snippet/update_image_dimensions.js b/crawl4ai/js_snippet/update_image_dimensions.js
new file mode 100644
index 0000000000000000000000000000000000000000..709a35d5143227718ef2a5c29385f1346af4de40
--- /dev/null
+++ b/crawl4ai/js_snippet/update_image_dimensions.js
@@ -0,0 +1,54 @@
+() => {
+ return new Promise((resolve) => {
+ const filterImage = (img) => {
+ // Filter out images that are too small
+ if (img.width < 100 && img.height < 100) return false;
+
+ // Filter out images that are not visible
+ const rect = img.getBoundingClientRect();
+ if (rect.width === 0 || rect.height === 0) return false;
+
+ // Filter out images with certain class names (e.g., icons, thumbnails)
+ if (img.classList.contains("icon") || img.classList.contains("thumbnail")) return false;
+
+ // Filter out images with certain patterns in their src (e.g., placeholder images)
+ if (img.src.includes("placeholder") || img.src.includes("icon")) return false;
+
+ return true;
+ };
+
+ const images = Array.from(document.querySelectorAll("img")).filter(filterImage);
+ let imagesLeft = images.length;
+
+ if (imagesLeft === 0) {
+ resolve();
+ return;
+ }
+
+ const checkImage = (img) => {
+ if (img.complete && img.naturalWidth !== 0) {
+ img.setAttribute("width", img.naturalWidth);
+ img.setAttribute("height", img.naturalHeight);
+ imagesLeft--;
+ if (imagesLeft === 0) resolve();
+ }
+ };
+
+ images.forEach((img) => {
+ checkImage(img);
+ if (!img.complete) {
+ img.onload = () => {
+ checkImage(img);
+ };
+ img.onerror = () => {
+ imagesLeft--;
+ if (imagesLeft === 0) resolve();
+ };
+ }
+ });
+
+ // Fallback timeout of 5 seconds
+ // setTimeout(() => resolve(), 5000);
+ resolve();
+ });
+};
diff --git a/crawl4ai/llmtxt.py b/crawl4ai/llmtxt.py
new file mode 100644
index 0000000000000000000000000000000000000000..94efe0767995af580e2f75c9b6a13f4be0f8d811
--- /dev/null
+++ b/crawl4ai/llmtxt.py
@@ -0,0 +1,498 @@
+import os
+from pathlib import Path
+import re
+from typing import Dict, List, Tuple, Optional, Any
+import json
+from tqdm import tqdm
+import time
+import psutil
+import numpy as np
+from rank_bm25 import BM25Okapi
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+from litellm import completion, batch_completion
+from .async_logger import AsyncLogger
+import litellm
+import pickle
+import hashlib # <--- ADDED for file-hash
+from fnmatch import fnmatch
+import glob
+
+litellm.set_verbose = False
+
+def _compute_file_hash(file_path: Path) -> str:
+ """Compute MD5 hash for the file's entire content."""
+ hash_md5 = hashlib.md5()
+ with file_path.open("rb") as f:
+ for chunk in iter(lambda: f.read(4096), b""):
+ hash_md5.update(chunk)
+ return hash_md5.hexdigest()
+
+class AsyncLLMTextManager:
+ def __init__(
+ self,
+ docs_dir: Path,
+ logger: Optional[AsyncLogger] = None,
+ max_concurrent_calls: int = 5,
+ batch_size: int = 3
+ ) -> None:
+ self.docs_dir = docs_dir
+ self.logger = logger
+ self.max_concurrent_calls = max_concurrent_calls
+ self.batch_size = batch_size
+ self.bm25_index = None
+ self.document_map: Dict[str, Any] = {}
+ self.tokenized_facts: List[str] = []
+ self.bm25_index_file = self.docs_dir / "bm25_index.pkl"
+
+ async def _process_document_batch(self, doc_batch: List[Path]) -> None:
+ """Process a batch of documents in parallel"""
+ contents = []
+ for file_path in doc_batch:
+ try:
+ with open(file_path, 'r', encoding='utf-8') as f:
+ contents.append(f.read())
+ except Exception as e:
+ self.logger.error(f"Error reading {file_path}: {str(e)}")
+ contents.append("") # Add empty content to maintain batch alignment
+
+ prompt = """Given a documentation file, generate a list of atomic facts where each fact:
+1. Represents a single piece of knowledge
+2. Contains variations in terminology for the same concept
+3. References relevant code patterns if they exist
+4. Is written in a way that would match natural language queries
+
+Each fact should follow this format:
+<concept_name>: <fact statement> | <terminology variations> | <code example>
+
+Example Facts:
+browser_config: Configure headless mode and browser type for AsyncWebCrawler | headless, browser_type, chromium, firefox | BrowserConfig(browser_type="chromium", headless=True)
+redis_connection: Redis client connection requires host and port configuration | redis setup, redis client, connection params | Redis(host='localhost', port=6379, db=0)
+pandas_filtering: Filter DataFrame rows using boolean conditions | dataframe filter, query, boolean indexing | df[df['column'] > 5]
+
+Wrap your response in <index>...</index> tags.
+"""
+
+ # Prepare messages for batch processing
+ messages_list = [
+ [
+ {"role": "user", "content": f"{prompt}\n\nGenerate index for this documentation:\n\n{content}"}
+ ]
+ for content in contents if content
+ ]
+
+ try:
+ responses = batch_completion(
+ model="anthropic/claude-3-5-sonnet-latest",
+ messages=messages_list,
+ logger_fn=None
+ )
+
+ # Process responses and save index files
+ for response, file_path in zip(responses, doc_batch):
+ try:
+ index_content_match = re.search(
+ r'<index>(.*?)</index>',
+ response.choices[0].message.content,
+ re.DOTALL
+ )
+ if not index_content_match:
+ self.logger.warning(f"No ... content found for {file_path}")
+ continue
+
+ index_content = re.sub(
+ r"\n\s*\n", "\n", index_content_match.group(1)
+ ).strip()
+ if index_content:
+ index_file = file_path.with_suffix('.q.md')
+ with open(index_file, 'w', encoding='utf-8') as f:
+ f.write(index_content)
+ self.logger.info(f"Created index file: {index_file}")
+ else:
+ self.logger.warning(f"No index content found in response for {file_path}")
+
+ except Exception as e:
+ self.logger.error(f"Error processing response for {file_path}: {str(e)}")
+
+ except Exception as e:
+ self.logger.error(f"Error in batch completion: {str(e)}")
+
+ def _validate_fact_line(self, line: str) -> Tuple[bool, Optional[str]]:
+ if "|" not in line:
+ return False, "Missing separator '|'"
+
+ parts = [p.strip() for p in line.split("|")]
+ if len(parts) != 3:
+ return False, f"Expected 3 parts, got {len(parts)}"
+
+ concept_part = parts[0]
+ if ":" not in concept_part:
+ return False, "Missing ':' in concept definition"
+
+ return True, None
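# Editor's sketch (not part of the patch): what the validator above accepts.
# The docs directory argument is a placeholder assumption.
mgr = AsyncLLMTextManager(Path("./docs"))
assert mgr._validate_fact_line(
    "browser_config: Configure headless mode | headless, chromium | BrowserConfig(headless=True)"
) == (True, None)
assert mgr._validate_fact_line("free-form text")[0] is False  # missing '|' separators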
+
+ def _load_or_create_token_cache(self, fact_file: Path) -> Dict:
+ """
+ Load token cache from .q.tokens if present and matching file hash.
+ Otherwise return a new structure with updated file-hash.
+ """
+ cache_file = fact_file.with_suffix(".q.tokens")
+ current_hash = _compute_file_hash(fact_file)
+
+ if cache_file.exists():
+ try:
+ with open(cache_file, "r") as f:
+ cache = json.load(f)
+ # If the hash matches, return it directly
+ if cache.get("content_hash") == current_hash:
+ return cache
+ # Otherwise, we signal that it's changed
+ self.logger.info(f"Hash changed for {fact_file}, reindex needed.")
+ except json.JSONDecodeError:
+ self.logger.warning(f"Corrupt token cache for {fact_file}, rebuilding.")
+ except Exception as e:
+ self.logger.warning(f"Error reading cache for {fact_file}: {str(e)}")
+
+ # Return a fresh cache
+ return {"facts": {}, "content_hash": current_hash}
+
+ def _save_token_cache(self, fact_file: Path, cache: Dict) -> None:
+ cache_file = fact_file.with_suffix(".q.tokens")
+ # Always ensure we're saving the correct file-hash
+ cache["content_hash"] = _compute_file_hash(fact_file)
+ with open(cache_file, "w") as f:
+ json.dump(cache, f)
+
+ def preprocess_text(self, text: str) -> List[str]:
+ parts = [x.strip() for x in text.split("|")] if "|" in text else [text]
+ # Remove : after the first word of parts[0]
+ parts[0] = re.sub(r"^(.*?):", r"\1", parts[0])
+
+ lemmatizer = WordNetLemmatizer()
+ stop_words = set(stopwords.words("english")) - {
+ "how", "what", "when", "where", "why", "which",
+ }
+
+ tokens = []
+ for part in parts:
+ if "(" in part and ")" in part:
+ code_tokens = re.findall(
+ r'[\w_]+(?=\()|[\w_]+(?==[\'"]{1}[\w_]+[\'"]{1})', part
+ )
+ tokens.extend(code_tokens)
+
+ words = word_tokenize(part.lower())
+ tokens.extend(
+ [
+ lemmatizer.lemmatize(token)
+ for token in words
+ if token not in stop_words
+ ]
+ )
+
+ return tokens
+
+ def maybe_load_bm25_index(self, clear_cache=False) -> bool:
+ """
+ Load existing BM25 index from disk, if present and clear_cache=False.
+ """
+ if not clear_cache and os.path.exists(self.bm25_index_file):
+ self.logger.info("Loading existing BM25 index from disk.")
+ with open(self.bm25_index_file, "rb") as f:
+ data = pickle.load(f)
+ self.tokenized_facts = data["tokenized_facts"]
+ self.bm25_index = data["bm25_index"]
+ return True
+ return False
+
+ def build_search_index(self, clear_cache=False) -> None:
+ """
+ Checks for new or modified .q.md files by comparing file-hash.
+ If none need reindexing and clear_cache is False, loads existing index if available.
+ Otherwise, reindexes only changed/new files and merges or creates a new index.
+ """
+ # If clear_cache is True, we skip partial logic: rebuild everything from scratch
+ if clear_cache:
+ self.logger.info("Clearing cache and rebuilding full search index.")
+ if self.bm25_index_file.exists():
+ self.bm25_index_file.unlink()
+
+ process = psutil.Process()
+ self.logger.info("Checking which .q.md files need (re)indexing...")
+
+ # Gather all .q.md files
+ q_files = [self.docs_dir / f for f in os.listdir(self.docs_dir) if f.endswith(".q.md")]
+
+ # We'll store known (unchanged) facts in these lists
+ existing_facts: List[str] = []
+ existing_tokens: List[List[str]] = []
+
+ # Keep track of invalid lines for logging
+ invalid_lines = []
+ needSet = [] # files that must be (re)indexed
+
+ for qf in q_files:
+ token_cache_file = qf.with_suffix(".q.tokens")
+
+ # If no .q.tokens or clear_cache is True → definitely reindex
+ if clear_cache or not token_cache_file.exists():
+ needSet.append(qf)
+ continue
+
+ # Otherwise, load the existing cache and compare hash
+ cache = self._load_or_create_token_cache(qf)
+ # If the .q.tokens was out of date (i.e. changed hash), we reindex
+ if len(cache["facts"]) == 0 or cache.get("content_hash") != _compute_file_hash(qf):
+ needSet.append(qf)
+ else:
+ # File is unchanged → retrieve cached token data
+ for line, cache_data in cache["facts"].items():
+ existing_facts.append(line)
+ existing_tokens.append(cache_data["tokens"])
+ self.document_map[line] = qf # track the doc for that fact
+
+ if not needSet and not clear_cache:
+ # If no file needs reindexing, try loading existing index
+ if self.maybe_load_bm25_index(clear_cache=False):
+ self.logger.info("No new/changed .q.md files found. Using existing BM25 index.")
+ return
+ else:
+ # If there's no existing index, we must build a fresh index from the old caches
+ self.logger.info("No existing BM25 index found. Building from cached facts.")
+ if existing_facts:
+ self.logger.info(f"Building BM25 index with {len(existing_facts)} cached facts.")
+ self.bm25_index = BM25Okapi(existing_tokens)
+ self.tokenized_facts = existing_facts
+ with open(self.bm25_index_file, "wb") as f:
+ pickle.dump({
+ "bm25_index": self.bm25_index,
+ "tokenized_facts": self.tokenized_facts
+ }, f)
+ else:
+ self.logger.warning("No facts found at all. Index remains empty.")
+ return
+
+ # -----------------------------------------------------
+ # If we reach here, we have new or changed .q.md files
+ # We'll parse them, reindex them, and then combine with existing_facts
+ # -----------------------------------------------------
+
+ self.logger.info(f"{len(needSet)} file(s) need reindexing. Parsing now...")
+
+ # 1) Parse the new or changed .q.md files
+ new_facts = []
+ new_tokens = []
+ with tqdm(total=len(needSet), desc="Indexing changed files") as file_pbar:
+ for file in needSet:
+ # We'll build up a fresh cache
+ fresh_cache = {"facts": {}, "content_hash": _compute_file_hash(file)}
+ try:
+ with open(file, "r", encoding="utf-8") as f_obj:
+ content = f_obj.read().strip()
+ lines = [l.strip() for l in content.split("\n") if l.strip()]
+
+ for line in lines:
+ is_valid, error = self._validate_fact_line(line)
+ if not is_valid:
+ invalid_lines.append((file, line, error))
+ continue
+
+ tokens = self.preprocess_text(line)
+ fresh_cache["facts"][line] = {
+ "tokens": tokens,
+ "added": time.time(),
+ }
+ new_facts.append(line)
+ new_tokens.append(tokens)
+ self.document_map[line] = file
+
+ # Save the new .q.tokens with updated hash
+ self._save_token_cache(file, fresh_cache)
+
+ mem_usage = process.memory_info().rss / 1024 / 1024
+ self.logger.debug(f"Memory usage after {file.name}: {mem_usage:.2f}MB")
+
+ except Exception as e:
+ self.logger.error(f"Error processing {file}: {str(e)}")
+
+ file_pbar.update(1)
+
+ if invalid_lines:
+ self.logger.warning(f"Found {len(invalid_lines)} invalid fact lines:")
+ for file, line, error in invalid_lines:
+ self.logger.warning(f"{file}: {error} in line: {line[:50]}...")
+
+ # 2) Merge newly tokenized facts with the existing ones
+ all_facts = existing_facts + new_facts
+ all_tokens = existing_tokens + new_tokens
+
+ # 3) Build BM25 index from combined facts
+ self.logger.info(f"Building BM25 index with {len(all_facts)} total facts (old + new).")
+ self.bm25_index = BM25Okapi(all_tokens)
+ self.tokenized_facts = all_facts
+
+ # 4) Save the updated BM25 index to disk
+ with open(self.bm25_index_file, "wb") as f:
+ pickle.dump({
+ "bm25_index": self.bm25_index,
+ "tokenized_facts": self.tokenized_facts
+ }, f)
+
+ final_mem = process.memory_info().rss / 1024 / 1024
+ self.logger.info(f"Search index updated. Final memory usage: {final_mem:.2f}MB")
+
+ async def generate_index_files(self, force_generate_facts: bool = False, clear_bm25_cache: bool = False) -> None:
+ """
+ Generate index files for all documents in parallel batches
+
+ Args:
+ force_generate_facts (bool): If True, regenerate indexes even if they exist
+ clear_bm25_cache (bool): If True, clear existing BM25 index cache
+ """
+ self.logger.info("Starting index generation for documentation files.")
+
+ md_files = [
+ self.docs_dir / f for f in os.listdir(self.docs_dir)
+ if f.endswith('.md') and not any(f.endswith(x) for x in ['.q.md', '.xs.md'])
+ ]
+
+        # Filter out files that already have .q.md files unless force_generate_facts=True
+ if not force_generate_facts:
+ md_files = [
+ f for f in md_files
+ if not (self.docs_dir / f.name.replace('.md', '.q.md')).exists()
+ ]
+
+ if not md_files:
+ self.logger.info("All index files exist. Use force=True to regenerate.")
+ else:
+ # Process documents in batches
+ for i in range(0, len(md_files), self.batch_size):
+ batch = md_files[i:i + self.batch_size]
+ self.logger.info(f"Processing batch {i//self.batch_size + 1}/{(len(md_files)//self.batch_size) + 1}")
+ await self._process_document_batch(batch)
+
+ self.logger.info("Index generation complete, building/updating search index.")
+ self.build_search_index(clear_cache=clear_bm25_cache)
+
+ def generate(self, sections: List[str], mode: str = "extended") -> str:
+ # Get all markdown files
+ all_files = glob.glob(str(self.docs_dir / "[0-9]*.md")) + \
+ glob.glob(str(self.docs_dir / "[0-9]*.xs.md"))
+
+ # Extract base names without extensions
+ base_docs = {Path(f).name.split('.')[0] for f in all_files
+ if not Path(f).name.endswith('.q.md')}
+
+ # Filter by sections if provided
+ if sections:
+ base_docs = {doc for doc in base_docs
+ if any(section.lower() in doc.lower() for section in sections)}
+
+ # Get file paths based on mode
+ files = []
+ for doc in sorted(base_docs, key=lambda x: int(x.split('_')[0]) if x.split('_')[0].isdigit() else 999999):
+ if mode == "condensed":
+ xs_file = self.docs_dir / f"{doc}.xs.md"
+ regular_file = self.docs_dir / f"{doc}.md"
+ files.append(str(xs_file if xs_file.exists() else regular_file))
+ else:
+ files.append(str(self.docs_dir / f"{doc}.md"))
+
+ # Read and format content
+ content = []
+ for file in files:
+ try:
+ with open(file, 'r', encoding='utf-8') as f:
+ fname = Path(file).name
+ content.append(f"{'#'*20}\n# {fname}\n{'#'*20}\n\n{f.read()}")
+ except Exception as e:
+ self.logger.error(f"Error reading {file}: {str(e)}")
+
+ return "\n\n---\n\n".join(content) if content else ""
+
+ def search(self, query: str, top_k: int = 5) -> str:
+ if not self.bm25_index:
+ return "No search index available. Call build_search_index() first."
+
+ query_tokens = self.preprocess_text(query)
+ doc_scores = self.bm25_index.get_scores(query_tokens)
+
+ mean_score = np.mean(doc_scores)
+ std_score = np.std(doc_scores)
+ score_threshold = mean_score + (0.25 * std_score)
+
+ file_data = self._aggregate_search_scores(
+ doc_scores=doc_scores,
+ score_threshold=score_threshold,
+ query_tokens=query_tokens,
+ )
+
+ ranked_files = sorted(
+ file_data.items(),
+ key=lambda x: (
+ x[1]["code_match_score"] * 2.0
+ + x[1]["match_count"] * 1.5
+ + x[1]["total_score"]
+ ),
+ reverse=True,
+ )[:top_k]
+
+ results = []
+ for file, _ in ranked_files:
+ main_doc = str(file).replace(".q.md", ".md")
+ if os.path.exists(self.docs_dir / main_doc):
+ with open(self.docs_dir / main_doc, "r", encoding='utf-8') as f:
+ only_file_name = main_doc.split("/")[-1]
+ content = [
+ "#" * 20,
+ f"# {only_file_name}",
+ "#" * 20,
+ "",
+ f.read()
+ ]
+ results.append("\n".join(content))
+
+ return "\n\n---\n\n".join(results)
+
+ def _aggregate_search_scores(
+ self, doc_scores: List[float], score_threshold: float, query_tokens: List[str]
+ ) -> Dict:
+ file_data = {}
+
+ for idx, score in enumerate(doc_scores):
+ if score <= score_threshold:
+ continue
+
+ fact = self.tokenized_facts[idx]
+ file_path = self.document_map[fact]
+
+ if file_path not in file_data:
+ file_data[file_path] = {
+ "total_score": 0,
+ "match_count": 0,
+ "code_match_score": 0,
+ "matched_facts": [],
+ }
+
+ components = fact.split("|") if "|" in fact else [fact]
+
+ code_match_score = 0
+ if len(components) == 3:
+ code_ref = components[2].strip()
+ code_tokens = self.preprocess_text(code_ref)
+ code_match_score = len(set(query_tokens) & set(code_tokens)) / len(query_tokens)
+
+ file_data[file_path]["total_score"] += score
+ file_data[file_path]["match_count"] += 1
+ file_data[file_path]["code_match_score"] = max(
+ file_data[file_path]["code_match_score"], code_match_score
+ )
+ file_data[file_path]["matched_facts"].append(fact)
+
+ return file_data
+
+ def refresh_index(self) -> None:
+ """Convenience method for a full rebuild."""
+ self.build_search_index(clear_cache=True)
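The build_search_index/search methods above lean on rank_bm25's BM25Okapi plus an adaptive score cutoff (mean + 0.25 * std) and a weighted per-file ranking. The standalone sketch below shows the same scoring steps in isolation; the fact strings and the simple whitespace tokenizer are made-up stand-ins for the class's own preprocess_text and .q.md content.

from rank_bm25 import BM25Okapi
import numpy as np

# Stand-in "facts" of the pipe-delimited form used above; the third component
# is what _aggregate_search_scores treats as a code reference.
facts = [
    "crawler | configure a proxy | AsyncWebCrawler(proxy='http://...')",
    "markdown | convert links to citations | DefaultMarkdownGenerator",
]
tokenized = [f.lower().split() for f in facts]   # stand-in for preprocess_text()

bm25 = BM25Okapi(tokenized)
query_tokens = "how do i configure a proxy".split()
scores = bm25.get_scores(query_tokens)

# Same adaptive cutoff as search(): keep facts scoring above mean + 0.25 * std.
threshold = np.mean(scores) + 0.25 * np.std(scores)
hits = [(facts[i], s) for i, s in enumerate(scores) if s > threshold]

# In the class above, hits are then grouped by source file and ranked by
# 2.0 * code_match_score + 1.5 * match_count + total_score before the matching
# main .md documents are loaded and concatenated.
print(hits)
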
diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..89e5e34e624c7212cffe1e19cb4e00bbcf2bfa5f
--- /dev/null
+++ b/crawl4ai/markdown_generation_strategy.py
@@ -0,0 +1,225 @@
+from abc import ABC, abstractmethod
+from typing import Optional, Dict, Any, Tuple
+from .models import MarkdownGenerationResult
+from .html2text import CustomHTML2Text
+from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter
+import re
+from urllib.parse import urljoin
+
+# Pre-compile the regex pattern
+LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)')
+
+def fast_urljoin(base: str, url: str) -> str:
+ """Fast URL joining for common cases."""
+ if url.startswith(('http://', 'https://', 'mailto:', '//')):
+ return url
+ if url.startswith('/'):
+ # Handle absolute paths
+ if base.endswith('/'):
+ return base[:-1] + url
+ return base + url
+ return urljoin(base, url)
+
+class MarkdownGenerationStrategy(ABC):
+ """Abstract base class for markdown generation strategies."""
+ def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None):
+ self.content_filter = content_filter
+ self.options = options or {}
+
+ @abstractmethod
+ def generate_markdown(self,
+ cleaned_html: str,
+ base_url: str = "",
+ html2text_options: Optional[Dict[str, Any]] = None,
+ content_filter: Optional[RelevantContentFilter] = None,
+ citations: bool = True,
+ **kwargs) -> MarkdownGenerationResult:
+ """Generate markdown from cleaned HTML."""
+ pass
+
+class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
+ """
+ Default implementation of markdown generation strategy.
+
+ How it works:
+ 1. Generate raw markdown from cleaned HTML.
+ 2. Convert links to citations.
+ 3. Generate fit markdown if content filter is provided.
+ 4. Return MarkdownGenerationResult.
+
+ Args:
+ content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
+ options (Optional[Dict[str, Any]]): Additional options for markdown generation. Defaults to None.
+
+ Returns:
+ MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
+ """
+ def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None):
+ super().__init__(content_filter, options)
+
+ def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]:
+ """
+ Convert links in markdown to citations.
+
+ How it works:
+ 1. Find all links in the markdown.
+ 2. Convert links to citations.
+ 3. Return converted markdown and references markdown.
+
+ Note:
+ This function uses a regex pattern to find links in markdown.
+
+ Args:
+ markdown (str): Markdown text.
+ base_url (str): Base URL for URL joins.
+
+ Returns:
+ Tuple[str, str]: Converted markdown and references markdown.
+ """
+ link_map = {}
+ url_cache = {} # Cache for URL joins
+ parts = []
+ last_end = 0
+ counter = 1
+
+ for match in LINK_PATTERN.finditer(markdown):
+ parts.append(markdown[last_end:match.start()])
+ text, url, title = match.groups()
+
+ # Use cached URL if available, otherwise compute and cache
+ if base_url and not url.startswith(('http://', 'https://', 'mailto:')):
+ if url not in url_cache:
+ url_cache[url] = fast_urljoin(base_url, url)
+ url = url_cache[url]
+
+ if url not in link_map:
+ desc = []
+ if title: desc.append(title)
+ if text and text != title: desc.append(text)
+ link_map[url] = (counter, ": " + " - ".join(desc) if desc else "")
+ counter += 1
+
+ num = link_map[url][0]
+ parts.append(f"{text}⟨{num}⟩" if not match.group(0).startswith('!') else f"![{text}⟨{num}⟩]")
+ last_end = match.end()
+
+ parts.append(markdown[last_end:])
+ converted_text = ''.join(parts)
+
+ # Pre-build reference strings
+ references = ["\n\n## References\n\n"]
+ references.extend(
+ f"⟨{num}⟩ {url}{desc}\n"
+ for url, (num, desc) in sorted(link_map.items(), key=lambda x: x[1][0])
+ )
+
+ return converted_text, ''.join(references)
+
+ def generate_markdown(self,
+ cleaned_html: str,
+ base_url: str = "",
+ html2text_options: Optional[Dict[str, Any]] = None,
+ options: Optional[Dict[str, Any]] = None,
+ content_filter: Optional[RelevantContentFilter] = None,
+ citations: bool = True,
+ **kwargs) -> MarkdownGenerationResult:
+ """
+ Generate markdown with citations from cleaned HTML.
+
+ How it works:
+ 1. Generate raw markdown from cleaned HTML.
+ 2. Convert links to citations.
+ 3. Generate fit markdown if content filter is provided.
+ 4. Return MarkdownGenerationResult.
+
+ Args:
+ cleaned_html (str): Cleaned HTML content.
+ base_url (str): Base URL for URL joins.
+ html2text_options (Optional[Dict[str, Any]]): HTML2Text options.
+ options (Optional[Dict[str, Any]]): Additional options for markdown generation.
+ content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
+ citations (bool): Whether to generate citations.
+
+ Returns:
+ MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
+ """
+ try:
+ # Initialize HTML2Text with default options for better conversion
+ h = CustomHTML2Text(baseurl=base_url)
+ default_options = {
+ 'body_width': 0, # Disable text wrapping
+ 'ignore_emphasis': False,
+ 'ignore_links': False,
+ 'ignore_images': False,
+ 'protect_links': True,
+ 'single_line_break': True,
+ 'mark_code': True,
+ 'escape_snob': False
+ }
+
+ # Update with custom options if provided
+ if html2text_options:
+ default_options.update(html2text_options)
+ elif options:
+ default_options.update(options)
+ elif self.options:
+ default_options.update(self.options)
+
+ h.update_params(**default_options)
+
+ # Ensure we have valid input
+ if not cleaned_html:
+ cleaned_html = ""
+ elif not isinstance(cleaned_html, str):
+ cleaned_html = str(cleaned_html)
+
+ # Generate raw markdown
+ try:
+ raw_markdown = h.handle(cleaned_html)
+ except Exception as e:
+ raw_markdown = f"Error converting HTML to markdown: {str(e)}"
+
+ raw_markdown = raw_markdown.replace(' ```', '```')
+
+ # Convert links to citations
+ markdown_with_citations: str = raw_markdown
+ references_markdown: str = ""
+ if citations:
+ try:
+ markdown_with_citations, references_markdown = self.convert_links_to_citations(
+ raw_markdown, base_url
+ )
+ except Exception as e:
+ markdown_with_citations = raw_markdown
+ references_markdown = f"Error generating citations: {str(e)}"
+
+ # Generate fit markdown if content filter is provided
+ fit_markdown: Optional[str] = ""
+ filtered_html: Optional[str] = ""
+ if content_filter or self.content_filter:
+ try:
+ content_filter = content_filter or self.content_filter
+ filtered_html = content_filter.filter_content(cleaned_html)
+                    filtered_html = '\n'.join('<div>{}</div>'.format(s) for s in filtered_html)
+ fit_markdown = h.handle(filtered_html)
+ except Exception as e:
+ fit_markdown = f"Error generating fit markdown: {str(e)}"
+ filtered_html = ""
+
+ return MarkdownGenerationResult(
+ raw_markdown=raw_markdown or "",
+ markdown_with_citations=markdown_with_citations or "",
+ references_markdown=references_markdown or "",
+ fit_markdown=fit_markdown or "",
+ fit_html=filtered_html or "",
+ )
+ except Exception as e:
+ # If anything fails, return empty strings with error message
+ error_msg = f"Error in markdown generation: {str(e)}"
+ return MarkdownGenerationResult(
+ raw_markdown=error_msg,
+ markdown_with_citations=error_msg,
+ references_markdown="",
+ fit_markdown="",
+ fit_html="",
+ )
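A minimal usage sketch of DefaultMarkdownGenerator (assumes the crawl4ai package is importable; the HTML snippet and base URL are made-up examples):

from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

generator = DefaultMarkdownGenerator()
result = generator.generate_markdown(
    cleaned_html="<p>See the <a href='/docs/quickstart'>quickstart</a> for details.</p>",
    base_url="https://example.com",
    citations=True,
)

print(result.raw_markdown)              # plain html2text output
print(result.markdown_with_citations)   # link text annotated with a ⟨1⟩ marker
print(result.references_markdown)       # "## References" section listing the resolved URL
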
diff --git a/crawl4ai/migrations.py b/crawl4ai/migrations.py
new file mode 100644
index 0000000000000000000000000000000000000000..3386b0fb433b4ba116476ee225606eab2cb3a956
--- /dev/null
+++ b/crawl4ai/migrations.py
@@ -0,0 +1,168 @@
+import os
+import asyncio
+import logging
+from pathlib import Path
+import aiosqlite
+from typing import Optional
+import xxhash
+import aiofiles
+import shutil
+import time
+from datetime import datetime
+from .async_logger import AsyncLogger, LogLevel
+
+# Initialize logger
+logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True)
+
+# logging.basicConfig(level=logging.INFO)
+# logger = logging.getLogger(__name__)
+
+class DatabaseMigration:
+ def __init__(self, db_path: str):
+ self.db_path = db_path
+ self.content_paths = self._ensure_content_dirs(os.path.dirname(db_path))
+
+ def _ensure_content_dirs(self, base_path: str) -> dict:
+ dirs = {
+ 'html': 'html_content',
+ 'cleaned': 'cleaned_html',
+ 'markdown': 'markdown_content',
+ 'extracted': 'extracted_content',
+ 'screenshots': 'screenshots'
+ }
+ content_paths = {}
+ for key, dirname in dirs.items():
+ path = os.path.join(base_path, dirname)
+ os.makedirs(path, exist_ok=True)
+ content_paths[key] = path
+ return content_paths
+
+ def _generate_content_hash(self, content: str) -> str:
+ x = xxhash.xxh64()
+ x.update(content.encode())
+ content_hash = x.hexdigest()
+ return content_hash
+ # return hashlib.sha256(content.encode()).hexdigest()
+
+ async def _store_content(self, content: str, content_type: str) -> str:
+ if not content:
+ return ""
+
+ content_hash = self._generate_content_hash(content)
+ file_path = os.path.join(self.content_paths[content_type], content_hash)
+
+ if not os.path.exists(file_path):
+ async with aiofiles.open(file_path, 'w', encoding='utf-8') as f:
+ await f.write(content)
+
+ return content_hash
+
+ async def migrate_database(self):
+ """Migrate existing database to file-based storage"""
+ # logger.info("Starting database migration...")
+ logger.info("Starting database migration...", tag="INIT")
+
+ try:
+ async with aiosqlite.connect(self.db_path) as db:
+ # Get all rows
+ async with db.execute(
+ '''SELECT url, html, cleaned_html, markdown,
+ extracted_content, screenshot FROM crawled_data'''
+ ) as cursor:
+ rows = await cursor.fetchall()
+
+ migrated_count = 0
+ for row in rows:
+ url, html, cleaned_html, markdown, extracted_content, screenshot = row
+
+ # Store content in files and get hashes
+ html_hash = await self._store_content(html, 'html')
+ cleaned_hash = await self._store_content(cleaned_html, 'cleaned')
+ markdown_hash = await self._store_content(markdown, 'markdown')
+ extracted_hash = await self._store_content(extracted_content, 'extracted')
+ screenshot_hash = await self._store_content(screenshot, 'screenshots')
+
+ # Update database with hashes
+ await db.execute('''
+ UPDATE crawled_data
+ SET html = ?,
+ cleaned_html = ?,
+ markdown = ?,
+ extracted_content = ?,
+ screenshot = ?
+ WHERE url = ?
+ ''', (html_hash, cleaned_hash, markdown_hash,
+ extracted_hash, screenshot_hash, url))
+
+ migrated_count += 1
+ if migrated_count % 100 == 0:
+ logger.info(f"Migrated {migrated_count} records...", tag="INIT")
+
+
+ await db.commit()
+ logger.success(f"Migration completed. {migrated_count} records processed.", tag="COMPLETE")
+
+ except Exception as e:
+ # logger.error(f"Migration failed: {e}")
+ logger.error(
+ message="Migration failed: {error}",
+ tag="ERROR",
+ params={"error": str(e)}
+ )
+ raise e
+
+async def backup_database(db_path: str) -> str:
+ """Create backup of existing database"""
+ if not os.path.exists(db_path):
+ logger.info("No existing database found. Skipping backup.", tag="INIT")
+ return None
+
+ # Create backup with timestamp
+ timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+ backup_path = f"{db_path}.backup_{timestamp}"
+
+ try:
+ # Wait for any potential write operations to finish
+ await asyncio.sleep(1)
+
+ # Create backup
+ shutil.copy2(db_path, backup_path)
+ logger.info(f"Database backup created at: {backup_path}", tag="COMPLETE")
+ return backup_path
+ except Exception as e:
+ # logger.error(f"Backup failed: {e}")
+ logger.error(
+ message="Migration failed: {error}",
+ tag="ERROR",
+ params={"error": str(e)}
+ )
+ raise e
+
+async def run_migration(db_path: Optional[str] = None):
+ """Run database migration"""
+ if db_path is None:
+ db_path = os.path.join(Path.home(), ".crawl4ai", "crawl4ai.db")
+
+ if not os.path.exists(db_path):
+ logger.info("No existing database found. Skipping migration.", tag="INIT")
+ return
+
+ # Create backup first
+ backup_path = await backup_database(db_path)
+ if not backup_path:
+ return
+
+ migration = DatabaseMigration(db_path)
+ await migration.migrate_database()
+
+def main():
+ """CLI entry point for migration"""
+ import argparse
+ parser = argparse.ArgumentParser(description='Migrate Crawl4AI database to file-based storage')
+ parser.add_argument('--db-path', help='Custom database path')
+ args = parser.parse_args()
+
+ asyncio.run(run_migration(args.db_path))
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
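A minimal sketch of triggering the migration programmatically instead of via the CLI entry point (assumes a legacy ~/.crawl4ai/crawl4ai.db exists; the custom path is illustrative):

import asyncio
from crawl4ai.migrations import run_migration

# Backs up crawl4ai.db with a timestamp suffix, then rewrites each row so the
# bulky columns hold content hashes pointing at files on disk.
asyncio.run(run_migration())

# Or point it at a non-default database file:
# asyncio.run(run_migration(db_path="/path/to/crawl4ai.db"))
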
diff --git a/crawl4ai/model_loader.py b/crawl4ai/model_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1872d7e48decbb1860caf25684a6d27004f19e0
--- /dev/null
+++ b/crawl4ai/model_loader.py
@@ -0,0 +1,256 @@
+from functools import lru_cache
+from pathlib import Path
+import subprocess, os
+import shutil
+import tarfile
+import argparse
+import urllib.request
+from crawl4ai.config import MODEL_REPO_BRANCH
+__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+
+@lru_cache()
+def get_available_memory(device):
+ import torch
+ if device.type == 'cuda':
+ return torch.cuda.get_device_properties(device).total_memory
+ elif device.type == 'mps':
+        return 48 * 1024 ** 3  # Assume 48GB of unified memory for MPS devices
+ else:
+ return 0
+
+@lru_cache()
+def calculate_batch_size(device):
+ available_memory = get_available_memory(device)
+
+ if device.type == 'cpu':
+ return 16
+ elif device.type in ['cuda', 'mps']:
+ # Adjust these thresholds based on your model size and available memory
+ if available_memory >= 31 * 1024 ** 3: # > 32GB
+ return 256
+ elif available_memory >= 15 * 1024 ** 3: # > 16GB to 32GB
+ return 128
+ elif available_memory >= 8 * 1024 ** 3: # 8GB to 16GB
+ return 64
+ else:
+ return 32
+ else:
+ return 16 # Default batch size
+
+@lru_cache()
+def get_device():
+ import torch
+ if torch.cuda.is_available():
+ device = torch.device('cuda')
+ elif torch.backends.mps.is_available():
+ device = torch.device('mps')
+ else:
+ device = torch.device('cpu')
+ return device
+
+def set_model_device(model):
+ device = get_device()
+ model.to(device)
+ return model, device
+
+@lru_cache()
+def get_home_folder():
+ home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
+ os.makedirs(home_folder, exist_ok=True)
+ os.makedirs(f"{home_folder}/cache", exist_ok=True)
+ os.makedirs(f"{home_folder}/models", exist_ok=True)
+ return home_folder
+
+@lru_cache()
+def load_bert_base_uncased():
+ from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', resume_download=None)
+ model = BertModel.from_pretrained('bert-base-uncased', resume_download=None)
+ model.eval()
+ model, device = set_model_device(model)
+ return tokenizer, model
+
+@lru_cache()
+def load_HF_embedding_model(model_name="BAAI/bge-small-en-v1.5") -> tuple:
+ """Load the Hugging Face model for embedding.
+
+ Args:
+ model_name (str, optional): The model name to load. Defaults to "BAAI/bge-small-en-v1.5".
+
+ Returns:
+ tuple: The tokenizer and model.
+ """
+ from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
+ tokenizer = AutoTokenizer.from_pretrained(model_name, resume_download=None)
+ model = AutoModel.from_pretrained(model_name, resume_download=None)
+ model.eval()
+ model, device = set_model_device(model)
+ return tokenizer, model
+
+@lru_cache()
+def load_text_classifier():
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ from transformers import pipeline
+ import torch
+
+ tokenizer = AutoTokenizer.from_pretrained("dstefa/roberta-base_topic_classification_nyt_news")
+ model = AutoModelForSequenceClassification.from_pretrained("dstefa/roberta-base_topic_classification_nyt_news")
+ model.eval()
+ model, device = set_model_device(model)
+ pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
+ return pipe
+
+@lru_cache()
+def load_text_multilabel_classifier():
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
+ import numpy as np
+ from scipy.special import expit
+ import torch
+
+ # # Check for available device: CUDA, MPS (for Apple Silicon), or CPU
+ # if torch.cuda.is_available():
+ # device = torch.device("cuda")
+ # elif torch.backends.mps.is_available():
+ # device = torch.device("mps")
+ # else:
+ # device = torch.device("cpu")
+ # # return load_spacy_model(), torch.device("cpu")
+
+
+ MODEL = "cardiffnlp/tweet-topic-21-multi"
+ tokenizer = AutoTokenizer.from_pretrained(MODEL, resume_download=None)
+ model = AutoModelForSequenceClassification.from_pretrained(MODEL, resume_download=None)
+ model.eval()
+ model, device = set_model_device(model)
+ class_mapping = model.config.id2label
+
+ def _classifier(texts, threshold=0.5, max_length=64):
+ tokens = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
+ tokens = {key: val.to(device) for key, val in tokens.items()} # Move tokens to the selected device
+
+ with torch.no_grad():
+ output = model(**tokens)
+
+ scores = output.logits.detach().cpu().numpy()
+ scores = expit(scores)
+ predictions = (scores >= threshold) * 1
+
+ batch_labels = []
+ for prediction in predictions:
+ labels = [class_mapping[i] for i, value in enumerate(prediction) if value == 1]
+ batch_labels.append(labels)
+
+ return batch_labels
+
+ return _classifier, device
+
+@lru_cache()
+def load_nltk_punkt():
+ import nltk
+ try:
+ nltk.data.find('tokenizers/punkt')
+ except LookupError:
+ nltk.download('punkt')
+ return nltk.data.find('tokenizers/punkt')
+
+@lru_cache()
+def load_spacy_model():
+ import spacy
+ name = "models/reuters"
+ home_folder = get_home_folder()
+ model_folder = Path(home_folder) / name
+
+ # Check if the model directory already exists
+ if not (model_folder.exists() and any(model_folder.iterdir())):
+ repo_url = "https://github.com/unclecode/crawl4ai.git"
+ branch = MODEL_REPO_BRANCH
+ repo_folder = Path(home_folder) / "crawl4ai"
+
+ print("[LOG] ⏬ Downloading Spacy model for the first time...")
+
+ # Remove existing repo folder if it exists
+ if repo_folder.exists():
+ try:
+ shutil.rmtree(repo_folder)
+ if model_folder.exists():
+ shutil.rmtree(model_folder)
+ except PermissionError:
+ print("[WARNING] Unable to remove existing folders. Please manually delete the following folders and try again:")
+ print(f"- {repo_folder}")
+ print(f"- {model_folder}")
+ return None
+
+ try:
+ # Clone the repository
+ subprocess.run(
+ ["git", "clone", "-b", branch, repo_url, str(repo_folder)],
+ stdout=subprocess.DEVNULL,
+ stderr=subprocess.DEVNULL,
+ check=True
+ )
+
+ # Create the models directory if it doesn't exist
+ models_folder = Path(home_folder) / "models"
+ models_folder.mkdir(parents=True, exist_ok=True)
+
+ # Copy the reuters model folder to the models directory
+ source_folder = repo_folder / "models" / "reuters"
+ shutil.copytree(source_folder, model_folder)
+
+ # Remove the cloned repository
+ shutil.rmtree(repo_folder)
+
+ print("[LOG] ✅ Spacy Model downloaded successfully")
+ except subprocess.CalledProcessError as e:
+ print(f"An error occurred while cloning the repository: {e}")
+ return None
+ except Exception as e:
+ print(f"An error occurred: {e}")
+ return None
+
+ try:
+ return spacy.load(str(model_folder))
+ except Exception as e:
+ print(f"Error loading spacy model: {e}")
+ return None
+
+def download_all_models(remove_existing=False):
+ """Download all models required for Crawl4AI."""
+ if remove_existing:
+ print("[LOG] Removing existing models...")
+ home_folder = get_home_folder()
+ model_folders = [
+ os.path.join(home_folder, "models/reuters"),
+ os.path.join(home_folder, "models"),
+ ]
+ for folder in model_folders:
+ if Path(folder).exists():
+ shutil.rmtree(folder)
+ print("[LOG] Existing models removed.")
+
+ # Load each model to trigger download
+ # print("[LOG] Downloading BERT Base Uncased...")
+ # load_bert_base_uncased()
+ # print("[LOG] Downloading BGE Small EN v1.5...")
+ # load_bge_small_en_v1_5()
+ # print("[LOG] Downloading ONNX model...")
+ # load_onnx_all_MiniLM_l6_v2()
+ print("[LOG] Downloading text classifier...")
+ _, device = load_text_multilabel_classifier()
+ print(f"[LOG] Text classifier loaded on {device}")
+ print("[LOG] Downloading custom NLTK Punkt model...")
+ load_nltk_punkt()
+ print("[LOG] ✅ All models downloaded successfully.")
+
+def main():
+ print("[LOG] Welcome to the Crawl4AI Model Downloader!")
+ print("[LOG] This script will download all the models required for Crawl4AI.")
+ parser = argparse.ArgumentParser(description="Crawl4AI Model Downloader")
+ parser.add_argument('--remove-existing', action='store_true', help="Remove existing models before downloading")
+ args = parser.parse_args()
+
+ download_all_models(remove_existing=args.remove_existing)
+
+if __name__ == "__main__":
+ main()
diff --git a/crawl4ai/models.py b/crawl4ai/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..6fb362a349fae94fdf96f8f5b1518371930b6e7b
--- /dev/null
+++ b/crawl4ai/models.py
@@ -0,0 +1,61 @@
+from pydantic import BaseModel, HttpUrl
+from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
+from dataclasses import dataclass
+from .ssl_certificate import SSLCertificate
+
+@dataclass
+class TokenUsage:
+ completion_tokens: int = 0
+ prompt_tokens: int = 0
+ total_tokens: int = 0
+ completion_tokens_details: Optional[dict] = None
+ prompt_tokens_details: Optional[dict] = None
+
+
+class UrlModel(BaseModel):
+ url: HttpUrl
+ forced: bool = False
+
+class MarkdownGenerationResult(BaseModel):
+ raw_markdown: str
+ markdown_with_citations: str
+ references_markdown: str
+ fit_markdown: Optional[str] = None
+ fit_html: Optional[str] = None
+
+class CrawlResult(BaseModel):
+ url: str
+ html: str
+ success: bool
+ cleaned_html: Optional[str] = None
+ media: Dict[str, List[Dict]] = {}
+ links: Dict[str, List[Dict]] = {}
+ downloaded_files: Optional[List[str]] = None
+ screenshot: Optional[str] = None
+    pdf: Optional[bytes] = None
+ markdown: Optional[Union[str, MarkdownGenerationResult]] = None
+ markdown_v2: Optional[MarkdownGenerationResult] = None
+ fit_markdown: Optional[str] = None
+ fit_html: Optional[str] = None
+ extracted_content: Optional[str] = None
+ metadata: Optional[dict] = None
+ error_message: Optional[str] = None
+ session_id: Optional[str] = None
+ response_headers: Optional[dict] = None
+ status_code: Optional[int] = None
+ ssl_certificate: Optional[SSLCertificate] = None
+ class Config:
+ arbitrary_types_allowed = True
+
+class AsyncCrawlResponse(BaseModel):
+ html: str
+ response_headers: Dict[str, str]
+ status_code: int
+ screenshot: Optional[str] = None
+ pdf_data: Optional[bytes] = None
+ get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
+ downloaded_files: Optional[List[str]] = None
+ ssl_certificate: Optional[SSLCertificate] = None
+
+ class Config:
+ arbitrary_types_allowed = True
diff --git a/crawl4ai/prompts.py b/crawl4ai/prompts.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a963e6d9215da02def72b686cd87abb8ebfed5b
--- /dev/null
+++ b/crawl4ai/prompts.py
@@ -0,0 +1,204 @@
+PROMPT_EXTRACT_BLOCKS = """Here is the URL of the webpage:
+{URL}
+
+And here is the cleaned HTML content of that webpage:
+
+{HTML}
+
+
+Your task is to break down this HTML content into semantically relevant blocks, and for each block, generate a JSON object with the following keys:
+
+- index: an integer representing the index of the block in the content
+- tags: a list of semantic tags that are relevant to the content of the block
+- content: a list of strings containing the text content of the block
+- questions: a list of 3 questions that a user may ask about the content in this block
+
+To generate the JSON objects:
+
+1. Carefully read through the HTML content and identify logical breaks or shifts in the content that would warrant splitting it into separate blocks.
+
+2. For each block:
+ a. Assign it an index based on its order in the content.
+ b. Analyze the content and generate a list of relevant semantic tags that describe what the block is about.
+ c. Extract the text content, clean it up if needed, and store it as a list of strings in the "content" field.
+ d. Come up with 3 questions that a user might ask about this specific block of content, based on the tags and content. The questions should be relevant and answerable by the content in the block.
+
+3. Ensure that the order of the JSON objects matches the order of the blocks as they appear in the original HTML content.
+
+4. Double-check that each JSON object includes all required keys (index, tags, content, questions) and that the values are in the expected format (integer, list of strings, etc.).
+
+5. Make sure the generated JSON is complete and parsable, with no errors or omissions.
+
+6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
+
+Please provide your output within tags, like this:
+
+
+[{
+ "index": 0,
+ "tags": ["introduction", "overview"],
+ "content": ["This is the first paragraph of the article, which provides an introduction and overview of the main topic."],
+ "questions": [
+ "What is the main topic of this article?",
+ "What can I expect to learn from reading this article?",
+ "Is this article suitable for beginners or experts in the field?"
+ ]
+},
+{
+ "index": 1,
+ "tags": ["history", "background"],
+ "content": ["This is the second paragraph, which delves into the history and background of the topic.",
+ "It provides context and sets the stage for the rest of the article."],
+ "questions": [
+ "What historical events led to the development of this topic?",
+ "How has the understanding of this topic evolved over time?",
+ "What are some key milestones in the history of this topic?"
+ ]
+}]
+
+
+Remember, the output should be a complete, parsable JSON wrapped in tags, with no omissions or errors. The JSON objects should semantically break down the content into relevant blocks, maintaining the original order."""
+
+PROMPT_EXTRACT_BLOCKS = """Here is the URL of the webpage:
+{URL}
+
+And here is the cleaned HTML content of that webpage:
+
+{HTML}
+
+
+Your task is to break down this HTML content into semantically relevant blocks, and for each block, generate a JSON object with the following keys:
+
+- index: an integer representing the index of the block in the content
+- content: a list of strings containing the text content of the block
+
+To generate the JSON objects:
+
+1. Carefully read through the HTML content and identify logical breaks or shifts in the content that would warrant splitting it into separate blocks.
+
+2. For each block:
+ a. Assign it an index based on its order in the content.
+   b. Analyze the content and generate ONE semantic tag that describes what the block is about.
+   c. Extract the text content, EXACTLY THE SAME AS THE GIVEN DATA, clean it up if needed, and store it as a list of strings in the "content" field.
+
+3. Ensure that the order of the JSON objects matches the order of the blocks as they appear in the original HTML content.
+
+4. Double-check that each JSON object includes all required keys (index, tag, content) and that the values are in the expected format (integer, list of strings, etc.).
+
+5. Make sure the generated JSON is complete and parsable, with no errors or omissions.
+
+6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
+
+7. Never alter the extracted content, just copy and paste it as it is.
+
+Please provide your output within tags, like this:
+
+
+[{
+ "index": 0,
+ "tags": ["introduction"],
+ "content": ["This is the first paragraph of the article, which provides an introduction and overview of the main topic."]
+},
+{
+ "index": 1,
+ "tags": ["background"],
+ "content": ["This is the second paragraph, which delves into the history and background of the topic.",
+ "It provides context and sets the stage for the rest of the article."]
+}]
+
+
+Remember, the output should be a complete, parsable JSON wrapped in tags, with no omissions or errors. The JSON objects should semantically break down the content into relevant blocks, maintaining the original order."""
+
+PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION = """Here is the URL of the webpage:
+{URL}
+
+And here is the cleaned HTML content of that webpage:
+
+{HTML}
+
+
+Your task is to break down this HTML content into semantically relevant blocks, following the provided user's REQUEST, and for each block, generate a JSON object with the following keys:
+
+- index: an integer representing the index of the block in the content
+- content: a list of strings containing the text content of the block
+
+This is the user's REQUEST, pay attention to it:
+
+{REQUEST}
+
+
+To generate the JSON objects:
+
+1. Carefully read through the HTML content and identify logical breaks or shifts in the content that would warrant splitting it into separate blocks.
+
+2. For each block:
+ a. Assign it an index based on its order in the content.
+   b. Analyze the content and generate ONE semantic tag that describes what the block is about.
+   c. Extract the text content, EXACTLY THE SAME AS THE GIVEN DATA, clean it up if needed, and store it as a list of strings in the "content" field.
+
+3. Ensure that the order of the JSON objects matches the order of the blocks as they appear in the original HTML content.
+
+4. Double-check that each JSON object includes all required keys (index, tag, content) and that the values are in the expected format (integer, list of strings, etc.).
+
+5. Make sure the generated JSON is complete and parsable, with no errors or omissions.
+
+6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
+
+7. Never alter the extracted content, just copy and paste it as it is.
+
+Please provide your output within tags, like this:
+
+
+[{
+ "index": 0,
+ "tags": ["introduction"],
+ "content": ["This is the first paragraph of the article, which provides an introduction and overview of the main topic."]
+},
+{
+ "index": 1,
+ "tags": ["background"],
+ "content": ["This is the second paragraph, which delves into the history and background of the topic.",
+ "It provides context and sets the stage for the rest of the article."]
+}]
+
+
+**Make sure to follow the user's instruction and extract blocks that align with it.**
+
+Remember, the output should be a complete, parsable JSON wrapped in tags, with no omissions or errors. The JSON objects should semantically break down the content into relevant blocks, maintaining the original order."""
+
+PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION = """Here is the content from the URL:
+{URL}
+
+
+{HTML}
+
+
+The user has made the following request for what information to extract from the above content:
+
+
+{REQUEST}
+
+
+
+{SCHEMA}
+
+
+Please carefully read the URL content and the user's request. If the user provided a desired JSON schema in the above, extract the requested information from the URL content according to that schema. If no schema was provided, infer an appropriate JSON schema based on the user's request that will best capture the key information they are looking for.
+
+Extraction instructions:
+Return the extracted information as a list of JSON objects, with each object in the list corresponding to a block of content from the URL, in the same order as it appears on the page. Wrap the entire JSON list in ... XML tags.
+
+Quality Reflection:
+Before outputting your final answer, double check that the JSON you are returning is complete, containing all the information requested by the user, and is valid JSON that could be parsed by json.loads() with no errors or omissions. The outputted JSON objects should fully match the schema, either provided or inferred.
+
+Quality Score:
+After reflecting, score the quality and completeness of the JSON data you are about to return on a scale of 1 to 5. Write the score inside tags.
+
+Avoid Common Mistakes:
+- Do NOT add any comments using "//" or "#" in the JSON output. It causes parsing errors.
+- Make sure the JSON is properly formatted with curly braces, square brackets, and commas in the right places.
+- Do not miss the closing tag at the end of the JSON output.
+- Do not generate Python code showing how to do the task; your task is to extract the information and return it in JSON format.
+
+Result:
+Output the final list of JSON objects, wrapped in ... XML tags. Make sure to close the tag properly."""
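A minimal sketch of filling one of these templates before sending it to an LLM. Plain str.replace is used here rather than str.format, since the templates contain literal braces in their embedded JSON examples; the URL, HTML, and request values are made up, and the downstream LLM call is whatever backend the caller is configured with.

from crawl4ai.prompts import PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION

prompt = (
    PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION
    .replace("{URL}", "https://example.com/pricing")
    .replace("{HTML}", "<h1>Pricing</h1><p>The Pro plan costs $20/month.</p>")
    .replace("{REQUEST}", "Extract only the pricing details.")
)
# `prompt` is then passed as the user message to the chosen LLM backend.
print(prompt[:200])
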
diff --git a/crawl4ai/ssl_certificate.py b/crawl4ai/ssl_certificate.py
new file mode 100644
index 0000000000000000000000000000000000000000..97529e3e1d4aa358003b4cca3b7c1ee8929bd852
--- /dev/null
+++ b/crawl4ai/ssl_certificate.py
@@ -0,0 +1,181 @@
+"""SSL Certificate class for handling certificate operations."""
+
+import ssl
+import socket
+import base64
+import json
+from typing import Dict, Any, Optional
+from urllib.parse import urlparse
+import OpenSSL.crypto
+from pathlib import Path
+
+
+class SSLCertificate:
+ """
+ A class representing an SSL certificate with methods to export in various formats.
+
+ Attributes:
+ cert_info (Dict[str, Any]): The certificate information.
+
+ Methods:
+ from_url(url: str, timeout: int = 10) -> Optional['SSLCertificate']: Create SSLCertificate instance from a URL.
+ from_file(file_path: str) -> Optional['SSLCertificate']: Create SSLCertificate instance from a file.
+ from_binary(binary_data: bytes) -> Optional['SSLCertificate']: Create SSLCertificate instance from binary data.
+ export_as_pem() -> str: Export the certificate as PEM format.
+ export_as_der() -> bytes: Export the certificate as DER format.
+ export_as_json() -> Dict[str, Any]: Export the certificate as JSON format.
+ export_as_text() -> str: Export the certificate as text format.
+ """
+ def __init__(self, cert_info: Dict[str, Any]):
+ self._cert_info = self._decode_cert_data(cert_info)
+
+ @staticmethod
+ def from_url(url: str, timeout: int = 10) -> Optional['SSLCertificate']:
+ """
+ Create SSLCertificate instance from a URL.
+
+ Args:
+ url (str): URL of the website.
+ timeout (int): Timeout for the connection (default: 10).
+
+ Returns:
+ Optional[SSLCertificate]: SSLCertificate instance if successful, None otherwise.
+ """
+ try:
+ hostname = urlparse(url).netloc
+ if ':' in hostname:
+ hostname = hostname.split(':')[0]
+
+ context = ssl.create_default_context()
+ with socket.create_connection((hostname, 443), timeout=timeout) as sock:
+ with context.wrap_socket(sock, server_hostname=hostname) as ssock:
+ cert_binary = ssock.getpeercert(binary_form=True)
+ x509 = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_ASN1, cert_binary)
+
+ cert_info = {
+ "subject": dict(x509.get_subject().get_components()),
+ "issuer": dict(x509.get_issuer().get_components()),
+ "version": x509.get_version(),
+ "serial_number": hex(x509.get_serial_number()),
+ "not_before": x509.get_notBefore(),
+ "not_after": x509.get_notAfter(),
+ "fingerprint": x509.digest("sha256").hex(),
+ "signature_algorithm": x509.get_signature_algorithm(),
+ "raw_cert": base64.b64encode(cert_binary)
+ }
+
+ # Add extensions
+ extensions = []
+ for i in range(x509.get_extension_count()):
+ ext = x509.get_extension(i)
+ extensions.append({
+ "name": ext.get_short_name(),
+ "value": str(ext)
+ })
+ cert_info["extensions"] = extensions
+
+ return SSLCertificate(cert_info)
+
+ except Exception as e:
+ return None
+
+ @staticmethod
+ def _decode_cert_data(data: Any) -> Any:
+ """Helper method to decode bytes in certificate data."""
+ if isinstance(data, bytes):
+ return data.decode('utf-8')
+ elif isinstance(data, dict):
+ return {
+ (k.decode('utf-8') if isinstance(k, bytes) else k): SSLCertificate._decode_cert_data(v)
+ for k, v in data.items()
+ }
+ elif isinstance(data, list):
+ return [SSLCertificate._decode_cert_data(item) for item in data]
+ return data
+
+ def to_json(self, filepath: Optional[str] = None) -> Optional[str]:
+ """
+ Export certificate as JSON.
+
+ Args:
+ filepath (Optional[str]): Path to save the JSON file (default: None).
+
+ Returns:
+ Optional[str]: JSON string if successful, None otherwise.
+ """
+ json_str = json.dumps(self._cert_info, indent=2, ensure_ascii=False)
+ if filepath:
+ Path(filepath).write_text(json_str, encoding='utf-8')
+ return None
+ return json_str
+
+ def to_pem(self, filepath: Optional[str] = None) -> Optional[str]:
+ """
+ Export certificate as PEM.
+
+ Args:
+ filepath (Optional[str]): Path to save the PEM file (default: None).
+
+ Returns:
+ Optional[str]: PEM string if successful, None otherwise.
+ """
+ try:
+ x509 = OpenSSL.crypto.load_certificate(
+ OpenSSL.crypto.FILETYPE_ASN1,
+ base64.b64decode(self._cert_info['raw_cert'])
+ )
+ pem_data = OpenSSL.crypto.dump_certificate(
+ OpenSSL.crypto.FILETYPE_PEM,
+ x509
+ ).decode('utf-8')
+
+ if filepath:
+ Path(filepath).write_text(pem_data, encoding='utf-8')
+ return None
+ return pem_data
+ except Exception as e:
+ return None
+
+ def to_der(self, filepath: Optional[str] = None) -> Optional[bytes]:
+ """
+ Export certificate as DER.
+
+ Args:
+ filepath (Optional[str]): Path to save the DER file (default: None).
+
+ Returns:
+ Optional[bytes]: DER bytes if successful, None otherwise.
+ """
+ try:
+ der_data = base64.b64decode(self._cert_info['raw_cert'])
+ if filepath:
+ Path(filepath).write_bytes(der_data)
+ return None
+ return der_data
+ except Exception:
+ return None
+
+ @property
+ def issuer(self) -> Dict[str, str]:
+ """Get certificate issuer information."""
+ return self._cert_info.get('issuer', {})
+
+ @property
+ def subject(self) -> Dict[str, str]:
+ """Get certificate subject information."""
+ return self._cert_info.get('subject', {})
+
+ @property
+ def valid_from(self) -> str:
+ """Get certificate validity start date."""
+ return self._cert_info.get('not_before', '')
+
+ @property
+ def valid_until(self) -> str:
+ """Get certificate validity end date."""
+ return self._cert_info.get('not_after', '')
+
+ @property
+ def fingerprint(self) -> str:
+ """Get certificate fingerprint."""
+ return self._cert_info.get('fingerprint', '')
diff --git a/crawl4ai/user_agent_generator.py b/crawl4ai/user_agent_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..6679bb1b25a2f020b403eb230a8a640194062f19
--- /dev/null
+++ b/crawl4ai/user_agent_generator.py
@@ -0,0 +1,305 @@
+import random
+from typing import Optional, Literal, List, Dict, Tuple
+import re
+
+
+class UserAgentGenerator:
+ """
+ Generate random user agents with specified constraints.
+
+ Attributes:
+ desktop_platforms (dict): A dictionary of possible desktop platforms and their corresponding user agent strings.
+ mobile_platforms (dict): A dictionary of possible mobile platforms and their corresponding user agent strings.
+ browser_combinations (dict): A dictionary of possible browser combinations and their corresponding user agent strings.
+ rendering_engines (dict): A dictionary of possible rendering engines and their corresponding user agent strings.
+ chrome_versions (list): A list of possible Chrome browser versions.
+ firefox_versions (list): A list of possible Firefox browser versions.
+ edge_versions (list): A list of possible Edge browser versions.
+ safari_versions (list): A list of possible Safari browser versions.
+
+    Methods:
+        generate(
+            device_type: Optional[Literal['desktop', 'mobile']] = None,
+            os_type: Optional[str] = None,
+            device_brand: Optional[str] = None,
+            browser_type: Optional[Literal['chrome', 'edge', 'safari', 'firefox']] = None,
+            num_browsers: int = 3
+        ): Generates a random user agent string based on the specified constraints.
+ """
+ def __init__(self):
+        # Desktop and mobile platform definitions
+ self.desktop_platforms = {
+ "windows": {
+ "10_64": "(Windows NT 10.0; Win64; x64)",
+ "10_32": "(Windows NT 10.0; WOW64)",
+ },
+ "macos": {
+ "intel": "(Macintosh; Intel Mac OS X 10_15_7)",
+ "newer": "(Macintosh; Intel Mac OS X 10.15; rv:109.0)",
+ },
+ "linux": {
+ "generic": "(X11; Linux x86_64)",
+ "ubuntu": "(X11; Ubuntu; Linux x86_64)",
+ "chrome_os": "(X11; CrOS x86_64 14541.0.0)",
+ }
+ }
+
+ self.mobile_platforms = {
+ "android": {
+ "samsung": "(Linux; Android 13; SM-S901B)",
+ "pixel": "(Linux; Android 12; Pixel 6)",
+ "oneplus": "(Linux; Android 13; OnePlus 9 Pro)",
+ "xiaomi": "(Linux; Android 12; M2102J20SG)",
+ },
+ "ios": {
+ "iphone": "(iPhone; CPU iPhone OS 16_5 like Mac OS X)",
+ "ipad": "(iPad; CPU OS 16_5 like Mac OS X)",
+ }
+ }
+
+ # Browser Combinations
+ self.browser_combinations = {
+ 1: [
+ ["chrome"],
+ ["firefox"],
+ ["safari"],
+ ["edge"]
+ ],
+ 2: [
+ ["gecko", "firefox"],
+ ["chrome", "safari"],
+ ["webkit", "safari"]
+ ],
+ 3: [
+ ["chrome", "safari", "edge"],
+ ["webkit", "chrome", "safari"]
+ ]
+ }
+
+ # Rendering Engines with versions
+ self.rendering_engines = {
+ "chrome_webkit": "AppleWebKit/537.36",
+ "safari_webkit": "AppleWebKit/605.1.15",
+ "gecko": [ # Added Gecko versions
+ "Gecko/20100101",
+ "Gecko/20100101", # Firefox usually uses this constant version
+ "Gecko/2010010",
+ ]
+ }
+
+ # Browser Versions
+ self.chrome_versions = [
+ "Chrome/119.0.6045.199",
+ "Chrome/118.0.5993.117",
+ "Chrome/117.0.5938.149",
+ "Chrome/116.0.5845.187",
+ "Chrome/115.0.5790.171",
+ ]
+
+ self.edge_versions = [
+ "Edg/119.0.2151.97",
+ "Edg/118.0.2088.76",
+ "Edg/117.0.2045.47",
+ "Edg/116.0.1938.81",
+ "Edg/115.0.1901.203",
+ ]
+
+ self.safari_versions = [
+ "Safari/537.36", # For Chrome-based
+ "Safari/605.1.15",
+ "Safari/604.1",
+ "Safari/602.1",
+ "Safari/601.5.17",
+ ]
+
+ # Added Firefox versions
+ self.firefox_versions = [
+ "Firefox/119.0",
+ "Firefox/118.0.2",
+ "Firefox/117.0.1",
+ "Firefox/116.0",
+ "Firefox/115.0.3",
+ "Firefox/114.0.2",
+ "Firefox/113.0.1",
+ "Firefox/112.0",
+ "Firefox/111.0.1",
+ "Firefox/110.0",
+ ]
+
+ def get_browser_stack(self, num_browsers: int = 1) -> List[str]:
+ """
+ Get a valid combination of browser versions.
+
+ How it works:
+ 1. Check if the number of browsers is supported.
+ 2. Randomly choose a combination of browsers.
+ 3. Iterate through the combination and add browser versions.
+ 4. Return the browser stack.
+
+ Args:
+ num_browsers: Number of browser specifications (1-3)
+
+ Returns:
+ List[str]: A list of browser versions.
+ """
+ if num_browsers not in self.browser_combinations:
+ raise ValueError(f"Unsupported number of browsers: {num_browsers}")
+
+ combination = random.choice(self.browser_combinations[num_browsers])
+ browser_stack = []
+
+ for browser in combination:
+ if browser == "chrome":
+ browser_stack.append(random.choice(self.chrome_versions))
+ elif browser == "firefox":
+ browser_stack.append(random.choice(self.firefox_versions))
+ elif browser == "safari":
+ browser_stack.append(random.choice(self.safari_versions))
+ elif browser == "edge":
+ browser_stack.append(random.choice(self.edge_versions))
+ elif browser == "gecko":
+ browser_stack.append(random.choice(self.rendering_engines["gecko"]))
+ elif browser == "webkit":
+ browser_stack.append(self.rendering_engines["chrome_webkit"])
+
+ return browser_stack
+
+ def generate(self,
+ device_type: Optional[Literal['desktop', 'mobile']] = None,
+ os_type: Optional[str] = None,
+ device_brand: Optional[str] = None,
+ browser_type: Optional[Literal['chrome', 'edge', 'safari', 'firefox']] = None,
+ num_browsers: int = 3) -> str:
+ """
+ Generate a random user agent with specified constraints.
+
+ Args:
+ device_type: 'desktop' or 'mobile'
+ os_type: 'windows', 'macos', 'linux', 'android', 'ios'
+ device_brand: Specific device brand
+ browser_type: 'chrome', 'edge', 'safari', or 'firefox'
+ num_browsers: Number of browser specifications (1-3)
+ """
+ # Get platform string
+ platform = self.get_random_platform(device_type, os_type, device_brand)
+
+ # Start with Mozilla
+ components = ["Mozilla/5.0", platform]
+
+ # Add browser stack
+ browser_stack = self.get_browser_stack(num_browsers)
+
+ # Add appropriate legacy token based on browser stack
+ if "Firefox" in str(browser_stack):
+ components.append(random.choice(self.rendering_engines["gecko"]))
+ elif "Chrome" in str(browser_stack) or "Safari" in str(browser_stack):
+ components.append(self.rendering_engines["chrome_webkit"])
+ components.append("(KHTML, like Gecko)")
+
+ # Add browser versions
+ components.extend(browser_stack)
+
+ return " ".join(components)
+
+ def generate_with_client_hints(self, **kwargs) -> Tuple[str, str]:
+ """Generate both user agent and matching client hints"""
+ user_agent = self.generate(**kwargs)
+ client_hints = self.generate_client_hints(user_agent)
+ return user_agent, client_hints
+
+ def get_random_platform(self, device_type, os_type, device_brand):
+ """Helper method to get random platform based on constraints"""
+ platforms = self.desktop_platforms if device_type == 'desktop' else \
+ self.mobile_platforms if device_type == 'mobile' else \
+ {**self.desktop_platforms, **self.mobile_platforms}
+
+ if os_type:
+ for platform_group in [self.desktop_platforms, self.mobile_platforms]:
+ if os_type in platform_group:
+ platforms = {os_type: platform_group[os_type]}
+ break
+
+ os_key = random.choice(list(platforms.keys()))
+ if device_brand and device_brand in platforms[os_key]:
+ return platforms[os_key][device_brand]
+ return random.choice(list(platforms[os_key].values()))
+
+ def parse_user_agent(self, user_agent: str) -> Dict[str, str]:
+ """Parse a user agent string to extract browser and version information"""
+ browsers = {
+ 'chrome': r'Chrome/(\d+)',
+ 'edge': r'Edg/(\d+)',
+ 'safari': r'Version/(\d+)',
+ 'firefox': r'Firefox/(\d+)'
+ }
+
+ result = {}
+ for browser, pattern in browsers.items():
+ match = re.search(pattern, user_agent)
+ if match:
+ result[browser] = match.group(1)
+
+ return result
+
+ def generate_client_hints(self, user_agent: str) -> str:
+ """Generate Sec-CH-UA header value based on user agent string"""
+ browsers = self.parse_user_agent(user_agent)
+
+ # Client hints components
+ hints = []
+
+ # Handle different browser combinations
+ if 'chrome' in browsers:
+ hints.append(f'"Chromium";v="{browsers["chrome"]}"')
+ hints.append('"Not_A Brand";v="8"')
+
+ if 'edge' in browsers:
+ hints.append(f'"Microsoft Edge";v="{browsers["edge"]}"')
+ else:
+ hints.append(f'"Google Chrome";v="{browsers["chrome"]}"')
+
+ elif 'firefox' in browsers:
+ # Firefox doesn't typically send Sec-CH-UA
+ return '""'
+
+ elif 'safari' in browsers:
+ # Safari's format for client hints
+ hints.append(f'"Safari";v="{browsers["safari"]}"')
+ hints.append('"Not_A Brand";v="8"')
+
+ return ', '.join(hints)
+
+# Example usage:
+if __name__ == "__main__":
+ generator = UserAgentGenerator()
+ print(generator.generate())
+
+ print("\nSingle browser (Chrome):")
+ print(generator.generate(num_browsers=1, browser_type='chrome'))
+
+ print("\nTwo browsers (Gecko/Firefox):")
+ print(generator.generate(num_browsers=2))
+
+ print("\nThree browsers (Chrome/Safari/Edge):")
+ print(generator.generate(num_browsers=3))
+
+ print("\nFirefox on Linux:")
+ print(generator.generate(
+ device_type='desktop',
+ os_type='linux',
+ browser_type='firefox',
+ num_browsers=2
+ ))
+
+ print("\nChrome/Safari/Edge on Windows:")
+ print(generator.generate(
+ device_type='desktop',
+ os_type='windows',
+ num_browsers=3
+ ))
\ No newline at end of file
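The __main__ block above only exercises generate(); the sketch below also shows the matching Sec-CH-UA helper. Output values vary between runs because browser versions are chosen at random, so the comments show representative examples only.

from crawl4ai.user_agent_generator import UserAgentGenerator

gen = UserAgentGenerator()
ua, hints = gen.generate_with_client_hints(
    device_type="desktop",
    os_type="windows",
    num_browsers=3,
)
print(ua)     # e.g. Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.6045.199 ...
print(hints)  # e.g. "Chromium";v="119", "Not_A Brand";v="8", "Google Chrome";v="119"
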
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..6fd7429f19df0ac1700dbac7b760cdc125697e14
--- /dev/null
+++ b/crawl4ai/utils.py
@@ -0,0 +1,1660 @@
+import time
+from urllib.parse import urlparse
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from bs4 import BeautifulSoup, Comment, element, Tag, NavigableString
+import json
+import html
+import re
+import os
+import platform
+from .prompts import PROMPT_EXTRACT_BLOCKS
+from .config import *
+from pathlib import Path
+from typing import Dict, Any
+from urllib.parse import urljoin
+import requests
+from requests.exceptions import InvalidSchema
+from typing import Optional, Tuple, Dict, Any
+import xxhash
+from colorama import Fore, Style, init
+import textwrap
+import cProfile
+import pstats
+from functools import wraps
+import asyncio
+
+
+class InvalidCSSSelectorError(Exception):
+ pass
+
+def create_box_message(message: str, type: str = "info", width: int = 120, add_newlines: bool = True, double_line: bool = False) -> str:
+ """
+ Create a styled message box with colored borders and formatted text.
+
+ How it works:
+ 1. Determines box style and colors based on the message type (e.g., info, warning).
+ 2. Wraps text to fit within the specified width.
+ 3. Constructs a box using characters (single or double lines) with appropriate formatting.
+ 4. Adds optional newlines before and after the box.
+
+ Args:
+ message (str): The message to display inside the box.
+ type (str): Type of the message (e.g., "info", "warning", "error", "success"). Defaults to "info".
+ width (int): Width of the box. Defaults to 120.
+ add_newlines (bool): Whether to add newlines before and after the box. Defaults to True.
+ double_line (bool): Whether to use double lines for the box border. Defaults to False.
+
+ Returns:
+ str: A formatted string containing the styled message box.
+ """
+
+ init()
+
+ # Define border and text colors for different types
+ styles = {
+ "warning": (Fore.YELLOW, Fore.LIGHTYELLOW_EX, "⚠"),
+ "info": (Fore.BLUE, Fore.LIGHTBLUE_EX, "ℹ"),
+ "success": (Fore.GREEN, Fore.LIGHTGREEN_EX, "✓"),
+ "error": (Fore.RED, Fore.LIGHTRED_EX, "×"),
+ }
+
+ border_color, text_color, prefix = styles.get(type.lower(), styles["info"])
+
+ # Define box characters based on line style
+ box_chars = {
+ "single": ("─", "│", "┌", "┐", "└", "┘"),
+ "double": ("═", "║", "╔", "╗", "╚", "╝")
+ }
+ line_style = "double" if double_line else "single"
+ h_line, v_line, tl, tr, bl, br = box_chars[line_style]
+
+ # Process lines with lighter text color
+ formatted_lines = []
+ raw_lines = message.split('\n')
+
+ if raw_lines:
+ first_line = f"{prefix} {raw_lines[0].strip()}"
+ wrapped_first = textwrap.fill(first_line, width=width-4)
+ formatted_lines.extend(wrapped_first.split('\n'))
+
+ for line in raw_lines[1:]:
+ if line.strip():
+ wrapped = textwrap.fill(f" {line.strip()}", width=width-4)
+ formatted_lines.extend(wrapped.split('\n'))
+ else:
+ formatted_lines.append("")
+
+ # Create the box with colored borders and lighter text
+ horizontal_line = h_line * (width - 1)
+ box = [
+ f"{border_color}{tl}{horizontal_line}{tr}",
+ *[f"{border_color}{v_line}{text_color} {line:<{width-2}}{border_color}{v_line}" for line in formatted_lines],
+ f"{border_color}{bl}{horizontal_line}{br}{Style.RESET_ALL}"
+ ]
+
+ result = "\n".join(box)
+ if add_newlines:
+ result = f"\n{result}\n"
+
+ return result
+
+def calculate_semaphore_count():
+ """
+ Calculate the optimal semaphore count based on system resources.
+
+ How it works:
+ 1. Determines the number of CPU cores and total system memory.
+ 2. Sets a base count as half of the available CPU cores.
+ 3. Limits the count based on memory, assuming 2GB per semaphore instance.
+ 4. Returns the minimum value between CPU and memory-based limits.
+
+ Returns:
+ int: The calculated semaphore count.
+ """
+
+ cpu_count = os.cpu_count()
+ memory_gb = get_system_memory() / (1024 ** 3) # Convert to GB
+ base_count = max(1, cpu_count // 2)
+ memory_based_cap = int(memory_gb / 2) # Assume 2GB per instance
+ return min(base_count, memory_based_cap)
+
+def get_system_memory():
+ """
+ Get the total system memory in bytes.
+
+ How it works:
+ 1. Detects the operating system.
+ 2. Reads memory information from system-specific commands or files.
+ 3. Converts the memory to bytes for uniformity.
+
+ Returns:
+ int: The total system memory in bytes.
+
+ Raises:
+ OSError: If the operating system is unsupported.
+ """
+
+ system = platform.system()
+ if system == "Linux":
+ with open('/proc/meminfo', 'r') as mem:
+ for line in mem:
+ if line.startswith('MemTotal:'):
+ return int(line.split()[1]) * 1024 # Convert KB to bytes
+ elif system == "Darwin": # macOS
+ import subprocess
+ output = subprocess.check_output(['sysctl', '-n', 'hw.memsize']).decode('utf-8')
+ return int(output.strip())
+ elif system == "Windows":
+ import ctypes
+ kernel32 = ctypes.windll.kernel32
+ c_ulonglong = ctypes.c_ulonglong
+ class MEMORYSTATUSEX(ctypes.Structure):
+ _fields_ = [
+ ('dwLength', ctypes.c_ulong),
+ ('dwMemoryLoad', ctypes.c_ulong),
+ ('ullTotalPhys', c_ulonglong),
+ ('ullAvailPhys', c_ulonglong),
+ ('ullTotalPageFile', c_ulonglong),
+ ('ullAvailPageFile', c_ulonglong),
+ ('ullTotalVirtual', c_ulonglong),
+ ('ullAvailVirtual', c_ulonglong),
+ ('ullAvailExtendedVirtual', c_ulonglong),
+ ]
+ memoryStatus = MEMORYSTATUSEX()
+ memoryStatus.dwLength = ctypes.sizeof(MEMORYSTATUSEX)
+ kernel32.GlobalMemoryStatusEx(ctypes.byref(memoryStatus))
+ return memoryStatus.ullTotalPhys
+ else:
+ raise OSError("Unsupported operating system")
+
+def get_home_folder():
+ """
+ Get or create the home folder for Crawl4AI configuration and cache.
+
+ How it works:
+ 1. Uses environment variables or defaults to the user's home directory.
+ 2. Creates `.crawl4ai` and its subdirectories (`cache`, `models`) if they don't exist.
+ 3. Returns the path to the home folder.
+
+ Returns:
+ str: The path to the Crawl4AI home folder.
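+
+ Example (illustrative; the environment override is a hypothetical value):
+ ```python
+ import os
+
+ os.environ["CRAWL4_AI_BASE_DIRECTORY"] = "/tmp" # hypothetical base directory
+ print(get_home_folder()) # e.g. /tmp/.crawl4ai
+ ```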
+ """
+
+ home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
+ os.makedirs(home_folder, exist_ok=True)
+ os.makedirs(f"{home_folder}/cache", exist_ok=True)
+ os.makedirs(f"{home_folder}/models", exist_ok=True)
+ return home_folder
+
+def beautify_html(escaped_html):
+ """
+ Beautifies an escaped HTML string.
+
+ Parameters:
+ escaped_html (str): A string containing escaped HTML.
+
+ Returns:
+ str: A beautifully formatted HTML string.
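+
+ Example (illustrative):
+ ```python
+ print(beautify_html("&lt;div&gt;&lt;p&gt;Hello&lt;/p&gt;&lt;/div&gt;"))
+ # prints the unescaped markup, one tag per line and indented
+ ```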
+ """
+ # Unescape the HTML string
+ unescaped_html = html.unescape(escaped_html)
+
+ # Use BeautifulSoup to parse and prettify the HTML
+ soup = BeautifulSoup(unescaped_html, 'html.parser')
+ pretty_html = soup.prettify()
+
+ return pretty_html
+
+def split_and_parse_json_objects(json_string):
+ """
+ Splits a JSON string which is a list of objects and tries to parse each object.
+
+ Parameters:
+ json_string (str): A string representation of a list of JSON objects, e.g., '[{...}, {...}, ...]'.
+
+ Returns:
+ tuple: A tuple containing two lists:
+ - First list contains all successfully parsed JSON objects.
+ - Second list contains the string representations of all segments that couldn't be parsed.
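+
+ Example (illustrative; the second object is deliberately malformed):
+ ```python
+ ok, bad = split_and_parse_json_objects('[{"a": 1}, {"b": }]')
+ # ok -> [{'a': 1}]
+ # bad -> ['{"b": }']
+ ```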
+ """
+ # Trim the leading '[' and trailing ']'
+ if json_string.startswith('[') and json_string.endswith(']'):
+ json_string = json_string[1:-1].strip()
+
+ # Split the string into segments that look like individual JSON objects
+ segments = []
+ depth = 0
+ start_index = 0
+
+ for i, char in enumerate(json_string):
+ if char == '{':
+ if depth == 0:
+ start_index = i
+ depth += 1
+ elif char == '}':
+ depth -= 1
+ if depth == 0:
+ segments.append(json_string[start_index:i+1])
+
+ # Try parsing each segment
+ parsed_objects = []
+ unparsed_segments = []
+
+ for segment in segments:
+ try:
+ obj = json.loads(segment)
+ parsed_objects.append(obj)
+ except json.JSONDecodeError:
+ unparsed_segments.append(segment)
+
+ return parsed_objects, unparsed_segments
+
+def sanitize_html(html):
+ """
+ Sanitize an HTML string by escaping quotes.
+
+ How it works:
+ 1. Replaces all unwanted and special characters with an empty string.
+ 2. Escapes double and single quotes for safe usage.
+
+ Args:
+ html (str): The HTML string to sanitize.
+
+ Returns:
+ str: The sanitized HTML string.
+ """
+
+ # Replace all unwanted and special characters with an empty string
+ sanitized_html = html
+ # sanitized_html = re.sub(r'[^\w\s.,;:!?=\[\]{}()<>\/\\\-"]', '', html)
+
+ # Escape all double and single quotes
+ sanitized_html = sanitized_html.replace('"', '\\"').replace("'", "\\'")
+
+ return sanitized_html
+
+def sanitize_input_encode(text: str) -> str:
+ """Sanitize input to handle potential encoding issues."""
+ try:
+ try:
+ if not text:
+ return ''
+ # Attempt to encode and decode as UTF-8 to handle potential encoding issues
+ return text.encode('utf-8', errors='ignore').decode('utf-8')
+ except UnicodeEncodeError as e:
+ print(f"Warning: Encoding issue detected. Some characters may be lost. Error: {e}")
+ # Fall back to ASCII if UTF-8 fails
+ return text.encode('ascii', errors='ignore').decode('ascii')
+ except Exception as e:
+ raise ValueError(f"Error sanitizing input: {str(e)}") from e
+
+def escape_json_string(s):
+ """
+ Escapes characters in a string to be JSON safe.
+
+ Parameters:
+ s (str): The input string to be escaped.
+
+ Returns:
+ str: The escaped string, safe for JSON encoding.
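+
+ Example (illustrative):
+ ```python
+ escaped = escape_json_string('He said "hi"')
+ # escaped == 'He said \\"hi\\"', safe to embed inside a JSON string value
+ ```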
+ """
+ # Replace problematic backslash first
+ s = s.replace('\\', '\\\\')
+
+ # Replace the double quote
+ s = s.replace('"', '\\"')
+
+ # Escape control characters
+ s = s.replace('\b', '\\b')
+ s = s.replace('\f', '\\f')
+ s = s.replace('\n', '\\n')
+ s = s.replace('\r', '\\r')
+ s = s.replace('\t', '\\t')
+
+ # Additional problematic characters
+ # Unicode control characters
+ s = re.sub(r'[\x00-\x1f\x7f-\x9f]', lambda x: '\\u{:04x}'.format(ord(x.group())), s)
+
+ return s
+
+def replace_inline_tags(soup, tags, only_text=False):
+ """
+ Replace inline HTML tags with Markdown-style equivalents.
+
+ How it works:
+ 1. Maps specific tags (e.g., <b>, <i>, <code>) to Markdown syntax.
+ 2. Finds and replaces all occurrences of these tags in the provided BeautifulSoup object.
+ 3. Optionally replaces tags with their text content only.
+
+ Args:
+ soup (BeautifulSoup): Parsed HTML content.
+ tags (List[str]): List of tags to replace.
+ only_text (bool): Whether to replace tags with plain text. Defaults to False.
+
+ Returns:
+ BeautifulSoup: Updated BeautifulSoup object with replaced tags.
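+
+ Example (illustrative):
+ ```python
+ from bs4 import BeautifulSoup
+
+ soup = BeautifulSoup("<p>A <b>bold</b> and <em>nice</em> word</p>", "html.parser")
+ replace_inline_tags(soup, ["b", "em"])
+ # str(soup) -> '<p>A **bold** and *nice* word</p>'
+ ```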
+ """
+
+ tag_replacements = {
+ 'b': lambda tag: f"**{tag.text}**",
+ 'i': lambda tag: f"*{tag.text}*",
+ 'u': lambda tag: f"__{tag.text}__",
+ 'span': lambda tag: f"{tag.text}",
+ 'del': lambda tag: f"~~{tag.text}~~",
+ 'ins': lambda tag: f"++{tag.text}++",
+ 'sub': lambda tag: f"~{tag.text}~",
+ 'sup': lambda tag: f"^^{tag.text}^^",
+ 'strong': lambda tag: f"**{tag.text}**",
+ 'em': lambda tag: f"*{tag.text}*",
+ 'code': lambda tag: f"`{tag.text}`",
+ 'kbd': lambda tag: f"`{tag.text}`",
+ 'var': lambda tag: f"_{tag.text}_",
+ 's': lambda tag: f"~~{tag.text}~~",
+ 'q': lambda tag: f'"{tag.text}"',
+ 'abbr': lambda tag: f"{tag.text} ({tag.get('title', '')})",
+ 'cite': lambda tag: f"_{tag.text}_",
+ 'dfn': lambda tag: f"_{tag.text}_",
+ 'time': lambda tag: f"{tag.text}",
+ 'small': lambda tag: f"{tag.text} ",
+ 'mark': lambda tag: f"=={tag.text}=="
+ }
+
+ replacement_data = [(tag, tag_replacements.get(tag, lambda t: t.text)) for tag in tags]
+
+ for tag_name, replacement_func in replacement_data:
+ for tag in soup.find_all(tag_name):
+ replacement_text = tag.text if only_text else replacement_func(tag)
+ tag.replace_with(replacement_text)
+
+ return soup
+
+ # for tag_name in tags:
+ # for tag in soup.find_all(tag_name):
+ # if not only_text:
+ # replacement_text = tag_replacements.get(tag_name, lambda t: t.text)(tag)
+ # tag.replace_with(replacement_text)
+ # else:
+ # tag.replace_with(tag.text)
+
+ # return soup
+
+def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None, **kwargs):
+ """
+ Extract structured content, media, and links from website HTML.
+
+ How it works:
+ 1. Parses the HTML content using BeautifulSoup.
+ 2. Extracts internal/external links and media (images, videos, audios).
+ 3. Cleans the content by removing unwanted tags and attributes.
+ 4. Converts cleaned HTML to Markdown.
+ 5. Collects metadata and returns the extracted information.
+
+ Args:
+ url (str): The website URL.
+ html (str): The HTML content of the website.
+ word_count_threshold (int): Minimum word count for content inclusion. Defaults to MIN_WORD_THRESHOLD.
+ css_selector (Optional[str]): CSS selector to extract specific content. Defaults to None.
+
+ Returns:
+ Dict[str, Any]: Extracted content including Markdown, cleaned HTML, media, links, and metadata.
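+
+ Example (illustrative sketch; `raw_html` is assumed to have been fetched separately):
+ ```python
+ result = get_content_of_website("https://example.com/", raw_html, word_count_threshold=5)
+ if result:
+ print(result["markdown"][:200])
+ ```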
+ """
+
+ try:
+ if not html:
+ return None
+ # Parse HTML content with BeautifulSoup
+ soup = BeautifulSoup(html, 'html.parser')
+
+ # Get the content within the <body> tag
+ body = soup.body
+
+ # If css_selector is provided, extract content based on the selector
+ if css_selector:
+ selected_elements = body.select(css_selector)
+ if not selected_elements:
+ raise InvalidCSSSelectorError(f"Invalid CSS selector: no elements found for {css_selector}")
+ div_tag = soup.new_tag('div')
+ for el in selected_elements:
+ div_tag.append(el)
+ body = div_tag
+
+ links = {
+ 'internal': [],
+ 'external': []
+ }
+
+ # Extract all internal and external links
+ for a in body.find_all('a', href=True):
+ href = a['href']
+ url_base = url.split('/')[2]
+ if href.startswith('http') and url_base not in href:
+ links['external'].append({
+ 'href': href,
+ 'text': a.get_text()
+ })
+ else:
+ links['internal'].append(
+ {
+ 'href': href,
+ 'text': a.get_text()
+ }
+ )
+
+ # Remove script, style, and other tags that don't carry useful content from body
+ for tag in body.find_all(['script', 'style', 'link', 'meta', 'noscript']):
+ tag.decompose()
+
+ # Remove all attributes from remaining tags in body, except for img tags
+ for tag in body.find_all():
+ if tag.name != 'img':
+ tag.attrs = {}
+
+ # Extract all img tags into [{src: '', alt: ''}]
+ media = {
+ 'images': [],
+ 'videos': [],
+ 'audios': []
+ }
+ for img in body.find_all('img'):
+ media['images'].append({
+ 'src': img.get('src'),
+ 'alt': img.get('alt'),
+ "type": "image"
+ })
+
+ # Extract all video tags into [{src: '', alt: ''}]
+ for video in body.find_all('video'):
+ media['videos'].append({
+ 'src': video.get('src'),
+ 'alt': video.get('alt'),
+ "type": "video"
+ })
+
+ # Extract all audio tags into [{src: '', alt: ''}]
+ for audio in body.find_all('audio'):
+ media['audios'].append({
+ 'src': audio.get('src'),
+ 'alt': audio.get('alt'),
+ "type": "audio"
+ })
+
+ # Replace images with their alt text or remove them if no alt text is available
+ for img in body.find_all('img'):
+ alt_text = img.get('alt')
+ if alt_text:
+ img.replace_with(soup.new_string(alt_text))
+ else:
+ img.decompose()
+
+
+ # Replace the content of all "pre" tags with their inner text
+ def replace_pre_tags_with_text(node):
+ for child in node.find_all('pre'):
+ # set child inner html to its text
+ child.string = child.get_text()
+ return node
+
+ # Replace all "pre" tags with their inner text
+ body = replace_pre_tags_with_text(body)
+
+ # Replace inline tags with their text content
+ body = replace_inline_tags(
+ body,
+ ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark'],
+ only_text=kwargs.get('only_text', False)
+ )
+
+ # Recursively remove empty elements, their parent elements, and elements with word count below threshold
+ def remove_empty_and_low_word_count_elements(node, word_count_threshold):
+ for child in node.contents:
+ if isinstance(child, element.Tag):
+ remove_empty_and_low_word_count_elements(child, word_count_threshold)
+ word_count = len(child.get_text(strip=True).split())
+ if (len(child.contents) == 0 and not child.get_text(strip=True)) or word_count < word_count_threshold:
+ child.decompose()
+ return node
+
+ body = remove_empty_and_low_word_count_elements(body, word_count_threshold)
+
+ def remove_small_text_tags(body: Tag, word_count_threshold: int = MIN_WORD_THRESHOLD):
+ # We'll use a list to collect all tags that don't meet the word count requirement
+ tags_to_remove = []
+
+ # Traverse all tags in the body
+ for tag in body.find_all(True): # True here means all tags
+ # Check if the tag contains text and if it's not just whitespace
+ if tag.string and tag.string.strip():
+ # Split the text by spaces and count the words
+ word_count = len(tag.string.strip().split())
+ # If the word count is less than the threshold, mark the tag for removal
+ if word_count < word_count_threshold:
+ tags_to_remove.append(tag)
+
+ # Remove all marked tags from the tree
+ for tag in tags_to_remove:
+ tag.decompose() # or tag.extract() to remove and get the element
+
+ return body
+
+
+ # Remove small text tags
+ body = remove_small_text_tags(body, word_count_threshold)
+
+ def is_empty_or_whitespace(tag: Tag):
+ if isinstance(tag, NavigableString):
+ return not tag.strip()
+ # Check if the tag itself is empty or all its children are empty/whitespace
+ if not tag.contents:
+ return True
+ return all(is_empty_or_whitespace(child) for child in tag.contents)
+
+ def remove_empty_tags(body: Tag):
+ # Continue processing until no more changes are made
+ changes = True
+ while changes:
+ changes = False
+ # Collect all tags that are empty or contain only whitespace
+ empty_tags = [tag for tag in body.find_all(True) if is_empty_or_whitespace(tag)]
+ for tag in empty_tags:
+ # If a tag is empty, decompose it
+ tag.decompose()
+ changes = True # Mark that a change was made
+
+ return body
+
+
+ # Remove empty tags
+ body = remove_empty_tags(body)
+
+ # Flatten nested elements with only one child of the same type
+ def flatten_nested_elements(node):
+ for child in node.contents:
+ if isinstance(child, element.Tag):
+ flatten_nested_elements(child)
+ if len(child.contents) == 1 and child.contents[0].name == child.name:
+ # print('Flattening:', child.name)
+ child_content = child.contents[0]
+ child.replace_with(child_content)
+
+ return node
+
+ body = flatten_nested_elements(body)
+
+
+
+ # Remove comments
+ for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
+ comment.extract()
+
+ # Remove consecutive empty newlines and replace multiple spaces with a single space
+ cleaned_html = str(body).replace('\n\n', '\n').replace('  ', ' ')
+
+ # Sanitize the cleaned HTML content
+ cleaned_html = sanitize_html(cleaned_html)
+ # sanitized_html = escape_json_string(cleaned_html)
+
+ # Convert cleaned HTML to Markdown
+ h = CustomHTML2Text()
+ h.ignore_links = True
+ markdown = h.handle(cleaned_html)
+ markdown = markdown.replace(' ```', '```')
+
+ try:
+ meta = extract_metadata(html, soup)
+ except Exception as e:
+ print('Error extracting metadata:', str(e))
+ meta = {}
+
+
+ # Return the Markdown content
+ return {
+ 'markdown': markdown,
+ 'cleaned_html': cleaned_html,
+ 'success': True,
+ 'media': media,
+ 'links': links,
+ 'metadata': meta
+ }
+
+ except Exception as e:
+ print('Error processing HTML content:', str(e))
+ raise InvalidCSSSelectorError(f"Invalid CSS selector: {css_selector}") from e
+
+def get_content_of_website_optimized(url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs) -> Dict[str, Any]:
+ if not html:
+ return None
+
+ soup = BeautifulSoup(html, 'html.parser')
+ body = soup.body
+
+ image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
+
+ for tag in kwargs.get('excluded_tags', []) or []:
+ for el in body.select(tag):
+ el.decompose()
+
+ if css_selector:
+ selected_elements = body.select(css_selector)
+ if not selected_elements:
+ raise InvalidCSSSelectorError(f"Invalid CSS selector: no elements found for {css_selector}")
+ body = soup.new_tag('div')
+ for el in selected_elements:
+ body.append(el)
+
+ links = {'internal': [], 'external': []}
+ media = {'images': [], 'videos': [], 'audios': []}
+
+ # Extract meaningful text for media files from closest parent
+ def find_closest_parent_with_useful_text(tag):
+ current_tag = tag
+ while current_tag:
+ current_tag = current_tag.parent
+ # Get the text content from the parent tag
+ if current_tag:
+ text_content = current_tag.get_text(separator=' ',strip=True)
+ # Check if the text content has at least image_description_min_word_threshold words
+ if len(text_content.split()) >= image_description_min_word_threshold:
+ return text_content
+ return None
+
+ def process_image(img, url, index, total_images):
+ # Check that an image is actually displayed and is not inside undesired HTML elements
+ def is_valid_image(img, parent, parent_classes):
+ style = img.get('style', '')
+ src = img.get('src', '')
+ classes_to_check = ['button', 'icon', 'logo']
+ tags_to_check = ['button', 'input']
+ return all([
+ 'display:none' not in style,
+ src,
+ not any(s in var for var in [src, img.get('alt', ''), *parent_classes] for s in classes_to_check),
+ parent.name not in tags_to_check
+ ])
+
+ # Score an image for its usefulness
+ def score_image_for_usefulness(img, base_url, index, images_count):
+ # Function to parse image height/width value and units
+ def parse_dimension(dimension):
+ if dimension:
+ match = re.match(r"(\d+)(\D*)", dimension)
+ if match:
+ number = int(match.group(1))
+ unit = match.group(2) or 'px' # Default unit is 'px' if not specified
+ return number, unit
+ return None, None
+
+ # Fetch image file metadata to extract size and extension
+ def fetch_image_file_size(img, base_url):
+ # If src is a relative path, construct the full URL; otherwise it may already be an absolute/CDN URL
+ img_url = urljoin(base_url,img.get('src'))
+ try:
+ response = requests.head(img_url)
+ if response.status_code == 200:
+ return response.headers.get('Content-Length',None)
+ else:
+ print(f"Failed to retrieve file size for {img_url}")
+ return None
+ except Exception:
+ # Any failure (invalid schema, network error, ...) just means the size is unknown
+ return None
+
+ image_height = img.get('height')
+ height_value, height_unit = parse_dimension(image_height)
+ image_width = img.get('width')
+ width_value, width_unit = parse_dimension(image_width)
+ image_size = 0 #int(fetch_image_file_size(img,base_url) or 0)
+ image_format = os.path.splitext(img.get('src',''))[1].lower()
+ # Remove . from format
+ image_format = image_format.strip('.')
+ score = 0
+ if height_value:
+ if height_unit == 'px' and height_value > 150:
+ score += 1
+ if height_unit in ['%','vh','vmin','vmax'] and height_value >30:
+ score += 1
+ if width_value:
+ if width_unit == 'px' and width_value > 150:
+ score += 1
+ if width_unit in ['%','vh','vmin','vmax'] and width_value >30:
+ score += 1
+ if image_size > 10000:
+ score += 1
+ if img.get('alt') != '':
+ score+=1
+ if image_format in ('jpg', 'png', 'webp'):
+ score+=1
+ if index/images_count<0.5:
+ score+=1
+ return score
+
+ if not is_valid_image(img, img.parent, img.parent.get('class', [])):
+ return None
+ score = score_image_for_usefulness(img, url, index, total_images)
+ if score <= IMAGE_SCORE_THRESHOLD:
+ return None
+ return {
+ 'src': img.get('src', '').replace('\\"', '"').strip(),
+ 'alt': img.get('alt', ''),
+ 'desc': find_closest_parent_with_useful_text(img),
+ 'score': score,
+ 'type': 'image'
+ }
+
+ def process_element(element: element.PageElement) -> bool:
+ try:
+ if isinstance(element, NavigableString):
+ if isinstance(element, Comment):
+ element.extract()
+ return False
+
+ if element.name in ['script', 'style', 'link', 'meta', 'noscript']:
+ element.decompose()
+ return False
+
+ keep_element = False
+
+ if element.name == 'a' and element.get('href'):
+ href = element['href']
+ url_base = url.split('/')[2]
+ link_data = {'href': href, 'text': element.get_text()}
+ if href.startswith('http') and url_base not in href:
+ links['external'].append(link_data)
+ else:
+ links['internal'].append(link_data)
+ keep_element = True
+
+ elif element.name == 'img':
+ return True # Always keep image elements
+
+ elif element.name in ['video', 'audio']:
+ media[f"{element.name}s"].append({
+ 'src': element.get('src'),
+ 'alt': element.get('alt'),
+ 'type': element.name,
+ 'description': find_closest_parent_with_useful_text(element)
+ })
+ source_tags = element.find_all('source')
+ for source_tag in source_tags:
+ media[f"{element.name}s"].append({
+ 'src': source_tag.get('src'),
+ 'alt': element.get('alt'),
+ 'type': element.name,
+ 'description': find_closest_parent_with_useful_text(element)
+ })
+ return True # Always keep video and audio elements
+
+ if element.name != 'pre':
+ if element.name in ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark']:
+ if kwargs.get('only_text', False):
+ element.replace_with(element.get_text())
+ else:
+ element.unwrap()
+ elif element.name != 'img':
+ element.attrs = {}
+
+ # Process children
+ for child in list(element.children):
+ if isinstance(child, NavigableString) and not isinstance(child, Comment):
+ if len(child.strip()) > 0:
+ keep_element = True
+ else:
+ if process_element(child):
+ keep_element = True
+
+
+ # Check word count
+ if not keep_element:
+ word_count = len(element.get_text(strip=True).split())
+ keep_element = word_count >= word_count_threshold
+
+ if not keep_element:
+ element.decompose()
+
+ return keep_element
+ except Exception as e:
+ print('Error processing element:', str(e))
+ return False
+
+ # Process images: filter them and extract contextual text from the page
+ imgs = body.find_all('img')
+ media['images'] = [
+ result for result in
+ (process_image(img, url, i, len(imgs)) for i, img in enumerate(imgs))
+ if result is not None
+ ]
+
+ process_element(body)
+
+ def flatten_nested_elements(node):
+ if isinstance(node, NavigableString):
+ return node
+ if len(node.contents) == 1 and isinstance(node.contents[0], element.Tag) and node.contents[0].name == node.name:
+ return flatten_nested_elements(node.contents[0])
+ node.contents = [flatten_nested_elements(child) for child in node.contents]
+ return node
+
+ body = flatten_nested_elements(body)
+ base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)')
+ for img in imgs:
+ try:
+ src = img.get('src', '')
+ if base64_pattern.match(src):
+ img['src'] = base64_pattern.sub('', src)
+ except Exception:
+ pass
+
+ cleaned_html = str(body).replace('\n\n', '\n').replace('  ', ' ')
+ cleaned_html = sanitize_html(cleaned_html)
+
+ h = CustomHTML2Text()
+ h.ignore_links = True
+ markdown = h.handle(cleaned_html)
+ markdown = markdown.replace(' ```', '```')
+
+ try:
+ meta = extract_metadata(html, soup)
+ except Exception as e:
+ print('Error extracting metadata:', str(e))
+ meta = {}
+
+ return {
+ 'markdown': markdown,
+ 'cleaned_html': cleaned_html,
+ 'success': True,
+ 'media': media,
+ 'links': links,
+ 'metadata': meta
+ }
+
+def extract_metadata(html, soup=None):
+ """
+ Extract page metadata (title, meta tags, Open Graph and Twitter Card properties) from HTML.
+
+ How it works:
+ 1. Parses the HTML (or reuses an already-parsed BeautifulSoup object) and locates the head section.
+ 2. Reads the title and the standard description, keywords, and author meta tags.
+ 3. Collects all Open Graph (og:*) and Twitter Card (twitter:*) properties.
+
+ Args:
+ html (str): The HTML content of the website.
+ soup (Optional[BeautifulSoup]): A pre-parsed BeautifulSoup object. Defaults to None.
+
+ Returns:
+ Dict[str, Any]: A dictionary of the extracted metadata fields.
+ """
+
+ metadata = {}
+
+ if not html and not soup:
+ return {}
+
+ if not soup:
+ soup = BeautifulSoup(html, 'lxml')
+
+ head = soup.head
+ if not head:
+ return metadata
+
+ # Title
+ title_tag = head.find('title')
+ metadata['title'] = title_tag.string.strip() if title_tag and title_tag.string else None
+
+ # Meta description
+ description_tag = head.find('meta', attrs={'name': 'description'})
+ metadata['description'] = description_tag.get('content', '').strip() if description_tag else None
+
+ # Meta keywords
+ keywords_tag = head.find('meta', attrs={'name': 'keywords'})
+ metadata['keywords'] = keywords_tag.get('content', '').strip() if keywords_tag else None
+
+ # Meta author
+ author_tag = head.find('meta', attrs={'name': 'author'})
+ metadata['author'] = author_tag.get('content', '').strip() if author_tag else None
+
+ # Open Graph metadata
+ og_tags = head.find_all('meta', attrs={'property': re.compile(r'^og:')})
+ for tag in og_tags:
+ property_name = tag.get('property', '').strip()
+ content = tag.get('content', '').strip()
+ if property_name and content:
+ metadata[property_name] = content
+
+ # Twitter Card metadata
+ twitter_tags = head.find_all('meta', attrs={'name': re.compile(r'^twitter:')})
+ for tag in twitter_tags:
+ property_name = tag.get('name', '').strip()
+ content = tag.get('content', '').strip()
+ if property_name and content:
+ metadata[property_name] = content
+
+ return metadata
+
+def extract_xml_tags(string):
+ """
+ Extracts XML tags from a string.
+
+ Args:
+ string (str): The input string containing XML tags.
+
+ Returns:
+ List[str]: A list of XML tags extracted from the input string.
+ """
+ tags = re.findall(r'<(\w+)>', string)
+ return list(set(tags))
+
+def extract_xml_data(tags, string):
+ """
+ Extract data for specified XML tags from a string.
+
+ How it works:
+ 1. Searches the string for each tag using regex.
+ 2. Extracts the content within the tags.
+ 3. Returns a dictionary of tag-content pairs.
+
+ Args:
+ tags (List[str]): The list of XML tags to extract.
+ string (str): The input string containing XML data.
+
+ Returns:
+ Dict[str, str]: A dictionary with tag names as keys and extracted content as values.
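+
+ Example (illustrative):
+ ```python
+ extract_xml_data(["blocks"], "<blocks>[1, 2]</blocks>")
+ # -> {'blocks': '[1, 2]'}
+ ```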
+ """
+
+ data = {}
+
+ for tag in tags:
+ pattern = f"<{tag}>(.*?)</{tag}>"
+ match = re.search(pattern, string, re.DOTALL)
+ if match:
+ data[tag] = match.group(1).strip()
+ else:
+ data[tag] = ""
+
+ return data
+
+def perform_completion_with_backoff(
+ provider,
+ prompt_with_variables,
+ api_token,
+ json_response = False,
+ base_url=None,
+ **kwargs
+ ):
+ """
+ Perform an API completion request with exponential backoff.
+
+ How it works:
+ 1. Sends a completion request to the API.
+ 2. Retries on rate-limit errors with exponential delays.
+ 3. Returns the API response or an error after all retries.
+
+ Args:
+ provider (str): The name of the API provider.
+ prompt_with_variables (str): The input prompt for the completion request.
+ api_token (str): The API token for authentication.
+ json_response (bool): Whether to request a JSON response. Defaults to False.
+ base_url (Optional[str]): The base URL for the API. Defaults to None.
+ **kwargs: Additional arguments for the API request.
+
+ Returns:
+ dict: The API response or an error message after all retries.
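+
+ Example (illustrative sketch; the provider string and token are placeholders and require `litellm` plus valid credentials):
+ ```python
+ response = perform_completion_with_backoff(
+ provider="openai/gpt-4o-mini",
+ prompt_with_variables="Summarise this page in one sentence: ...",
+ api_token="sk-...",
+ )
+ print(response.choices[0].message.content)
+ ```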
+ """
+
+ from litellm import completion
+ from litellm.exceptions import RateLimitError
+ max_attempts = 3
+ base_delay = 2 # Base delay in seconds; adjust as needed
+
+ extra_args = {
+ "temperature": 0.01,
+ 'api_key': api_token,
+ 'base_url': base_url
+ }
+ if json_response:
+ extra_args["response_format"] = { "type": "json_object" }
+
+ if kwargs.get("extra_args"):
+ extra_args.update(kwargs["extra_args"])
+
+ for attempt in range(max_attempts):
+ try:
+
+ response = completion(
+ model=provider,
+ messages=[
+ {"role": "user", "content": prompt_with_variables}
+ ],
+ **extra_args
+ )
+ return response # Return the successful response
+ except RateLimitError as e:
+ print("Rate limit error:", str(e))
+
+ # Check if we have exhausted our max attempts
+ if attempt < max_attempts - 1:
+ # Calculate the delay and wait
+ delay = base_delay * (2 ** attempt) # Exponential backoff formula
+ print(f"Waiting for {delay} seconds before retrying...")
+ time.sleep(delay)
+ else:
+ # Return an error response after exhausting all retries
+ return [{
+ "index": 0,
+ "tags": ["error"],
+ "content": ["Rate limit error. Please try again later."]
+ }]
+
+def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None, base_url = None):
+ """
+ Extract content blocks from website HTML using an AI provider.
+
+ How it works:
+ 1. Prepares a prompt by sanitizing and escaping HTML.
+ 2. Sends the prompt to an AI provider with optional retries.
+ 3. Parses the response to extract structured blocks or errors.
+
+ Args:
+ url (str): The website URL.
+ html (str): The HTML content of the website.
+ provider (str): The AI provider for content extraction. Defaults to DEFAULT_PROVIDER.
+ api_token (Optional[str]): The API token for authentication. Defaults to None.
+ base_url (Optional[str]): The base URL for the API. Defaults to None.
+
+ Returns:
+ List[dict]: A list of extracted content blocks.
+ """
+
+ # api_token = os.getenv('GROQ_API_KEY', None) if not api_token else api_token
+ api_token = PROVIDER_MODELS.get(provider, None) if not api_token else api_token
+
+ variable_values = {
+ "URL": url,
+ "HTML": escape_json_string(sanitize_html(html)),
+ }
+
+ prompt_with_variables = PROMPT_EXTRACT_BLOCKS
+ for variable in variable_values:
+ prompt_with_variables = prompt_with_variables.replace(
+ "{" + variable + "}", variable_values[variable]
+ )
+
+ response = perform_completion_with_backoff(provider, prompt_with_variables, api_token, base_url=base_url)
+
+ try:
+ blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
+ blocks = json.loads(blocks)
+ ## Add error: False to the blocks
+ for block in blocks:
+ block['error'] = False
+ except Exception as e:
+ parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content)
+ blocks = parsed
+ # Append all unparsed segments as one error block whose content is the list of unparsed segments
+ if unparsed:
+ blocks.append({
+ "index": 0,
+ "error": True,
+ "tags": ["error"],
+ "content": unparsed
+ })
+ return blocks
+
+def extract_blocks_batch(batch_data, provider = "groq/llama3-70b-8192", api_token = None):
+ """
+ Extract content blocks from a batch of website HTMLs.
+
+ How it works:
+ 1. Prepares prompts for each URL and HTML pair.
+ 2. Sends the prompts to the AI provider in a batch request.
+ 3. Parses the responses to extract structured blocks or errors.
+
+ Args:
+ batch_data (List[Tuple[str, str]]): A list of (URL, HTML) pairs.
+ provider (str): The AI provider for content extraction. Defaults to "groq/llama3-70b-8192".
+ api_token (Optional[str]): The API token for authentication. Defaults to None.
+
+ Returns:
+ List[dict]: A list of extracted content blocks from all batch items.
+ """
+
+ api_token = os.getenv('GROQ_API_KEY', None) if not api_token else api_token
+ from litellm import batch_completion
+ messages = []
+
+ for url, html in batch_data:
+ variable_values = {
+ "URL": url,
+ "HTML": html,
+ }
+
+ prompt_with_variables = PROMPT_EXTRACT_BLOCKS
+ for variable in variable_values:
+ prompt_with_variables = prompt_with_variables.replace(
+ "{" + variable + "}", variable_values[variable]
+ )
+
+ messages.append([{"role": "user", "content": prompt_with_variables}])
+
+
+ responses = batch_completion(
+ model = provider,
+ messages = messages,
+ temperature = 0.01
+ )
+
+ all_blocks = []
+ for response in responses:
+ try:
+ blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
+ blocks = json.loads(blocks)
+
+ except Exception as e:
+ blocks = [{
+ "index": 0,
+ "tags": ["error"],
+ "content": ["Error extracting blocks from the HTML content. Choose another provider/model or try again."],
+ "questions": ["What went wrong during the block extraction process?"]
+ }]
+ all_blocks.append(blocks)
+
+ return sum(all_blocks, [])
+
+def merge_chunks_based_on_token_threshold(chunks, token_threshold):
+ """
+ Merges small chunks into larger ones based on the total token threshold.
+
+ :param chunks: List of text chunks to be merged based on token count.
+ :param token_threshold: Max number of tokens for each merged chunk.
+ :return: List of merged text chunks.
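+
+ Example (illustrative; tokens are estimated as word count * 1.3):
+ ```python
+ merge_chunks_based_on_token_threshold(["one two", "three four", "five six seven"], token_threshold=6)
+ # -> two sections: "one two" and "three four" merged together, "five six seven" on its own
+ ```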
+ """
+ merged_sections = []
+ current_chunk = []
+ total_token_so_far = 0
+
+ for chunk in chunks:
+ chunk_token_count = len(chunk.split()) * 1.3 # Estimate token count with a factor
+ if total_token_so_far + chunk_token_count < token_threshold:
+ current_chunk.append(chunk)
+ total_token_so_far += chunk_token_count
+ else:
+ if current_chunk:
+ merged_sections.append('\n\n'.join(current_chunk))
+ current_chunk = [chunk]
+ total_token_so_far = chunk_token_count
+
+ # Add the last chunk if it exists
+ if current_chunk:
+ merged_sections.append('\n\n'.join(current_chunk))
+
+ return merged_sections
+
+def process_sections(url: str, sections: list, provider: str, api_token: str, base_url=None) -> list:
+ """
+ Process sections of HTML content sequentially or in parallel.
+
+ How it works:
+ 1. Sequentially processes sections with delays for "groq/" providers.
+ 2. Uses ThreadPoolExecutor for parallel processing with other providers.
+ 3. Extracts content blocks for each section.
+
+ Args:
+ url (str): The website URL.
+ sections (List[str]): The list of HTML sections to process.
+ provider (str): The AI provider for content extraction.
+ api_token (str): The API token for authentication.
+ base_url (Optional[str]): The base URL for the API. Defaults to None.
+
+ Returns:
+ List[dict]: The list of extracted content blocks from all sections.
+ """
+
+ extracted_content = []
+ if provider.startswith("groq/"):
+ # Sequential processing with a delay
+ for section in sections:
+ extracted_content.extend(extract_blocks(url, section, provider, api_token, base_url=base_url))
+ time.sleep(0.5) # 500 ms delay between each processing
+ else:
+ # Parallel processing using ThreadPoolExecutor
+ with ThreadPoolExecutor() as executor:
+ futures = [executor.submit(extract_blocks, url, section, provider, api_token, base_url=base_url) for section in sections]
+ for future in as_completed(futures):
+ extracted_content.extend(future.result())
+
+ return extracted_content
+
+def wrap_text(draw, text, font, max_width):
+ """
+ Wrap text to fit within a specified width for rendering.
+
+ How it works:
+ 1. Splits the text into words.
+ 2. Constructs lines that fit within the maximum width using the provided font.
+ 3. Returns the wrapped text as a single string.
+
+ Args:
+ draw (ImageDraw.Draw): The drawing context for measuring text size.
+ text (str): The text to wrap.
+ font (ImageFont.FreeTypeFont): The font to use for measuring text size.
+ max_width (int): The maximum width for each line.
+
+ Returns:
+ str: The wrapped text.
+ """
+
+ # Wrap the text to fit within the specified width
+ lines = []
+ words = text.split()
+ while words:
+ line = ''
+ while words and draw.textbbox((0, 0), line + words[0], font=font)[2] <= max_width:
+ line += (words.pop(0) + ' ')
+ lines.append(line)
+ return '\n'.join(lines)
+
+def format_html(html_string):
+ """
+ Prettify an HTML string using BeautifulSoup.
+
+ How it works:
+ 1. Parses the HTML string with BeautifulSoup.
+ 2. Formats the HTML with proper indentation.
+ 3. Returns the prettified HTML string.
+
+ Args:
+ html_string (str): The HTML string to format.
+
+ Returns:
+ str: The prettified HTML string.
+ """
+
+ soup = BeautifulSoup(html_string, 'lxml')
+ return soup.prettify()
+
+def fast_format_html(html_string):
+ """
+ A fast HTML formatter that uses string operations instead of parsing.
+
+ Args:
+ html_string (str): The HTML string to format
+
+ Returns:
+ str: The formatted HTML string
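+
+ Example (illustrative):
+ ```python
+ print(fast_format_html("<div><p>hi</p></div>"))
+ # <div>
+ #   <p>
+ #     hi
+ #   </p>
+ # </div>
+ ```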
+ """
+ # Initialize variables
+ indent = 0
+ indent_str = " " # Two spaces for indentation
+ formatted = []
+ in_content = False
+
+ # Split by < and > to separate tags and content
+ parts = html_string.replace('>', '>\n').replace('<', '\n<').split('\n')
+
+ for part in parts:
+ if not part.strip():
+ continue
+
+ # Handle closing tags
+ if part.startswith('</'):
+ indent -= 1
+ formatted.append(indent_str * indent + part)
+
+ # Handle self-closing tags
+ elif part.startswith('<') and part.endswith('/>'):
+ formatted.append(indent_str * indent + part)
+
+ # Handle opening tags
+ elif part.startswith('<'):
+ formatted.append(indent_str * indent + part)
+ indent += 1
+
+ # Handle content between tags
+ else:
+ content = part.strip()
+ if content:
+ formatted.append(indent_str * indent + content)
+
+ return '\n'.join(formatted)
+
+def normalize_url(href, base_url):
+ """Normalize URLs to ensure consistent format"""
+ from urllib.parse import urljoin, urlparse
+
+ # Parse base URL to get components
+ parsed_base = urlparse(base_url)
+ if not parsed_base.scheme or not parsed_base.netloc:
+ raise ValueError(f"Invalid base URL format: {base_url}")
+
+ # Use urljoin to handle all cases
+ normalized = urljoin(base_url, href.strip())
+ return normalized
+
+def normalize_url_tmp(href, base_url):
+ """Normalize URLs to ensure consistent format"""
+ # Extract protocol and domain from base URL
+ try:
+ base_parts = base_url.split('/')
+ protocol = base_parts[0]
+ domain = base_parts[2]
+ except IndexError:
+ raise ValueError(f"Invalid base URL format: {base_url}")
+
+ # Handle special protocols
+ special_protocols = {'mailto:', 'tel:', 'ftp:', 'file:', 'data:', 'javascript:'}
+ if any(href.lower().startswith(proto) for proto in special_protocols):
+ return href.strip()
+
+ # Handle anchor links
+ if href.startswith('#'):
+ return f"{base_url}{href}"
+
+ # Handle protocol-relative URLs
+ if href.startswith('//'):
+ return f"{protocol}{href}"
+
+ # Handle root-relative URLs
+ if href.startswith('/'):
+ return f"{protocol}//{domain}{href}"
+
+ # Handle relative URLs
+ if not href.startswith(('http://', 'https://')):
+ # Remove a leading './' if present (lstrip would strip any run of '.' and '/' characters)
+ if href.startswith('./'):
+ href = href[2:]
+ return f"{protocol}//{domain}/{href}"
+
+ return href.strip()
+
+def get_base_domain(url: str) -> str:
+ """
+ Extract the base domain from a given URL, handling common edge cases.
+
+ How it works:
+ 1. Parses the URL to extract the domain.
+ 2. Removes the port number and 'www' prefix.
+ 3. Handles special domains (e.g., 'co.uk') to extract the correct base.
+
+ Args:
+ url (str): The URL to extract the base domain from.
+
+ Returns:
+ str: The extracted base domain or an empty string if parsing fails.
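+
+ Example (illustrative):
+ ```python
+ get_base_domain("https://www.blog.example.co.uk/post/1") # -> 'example.co.uk'
+ get_base_domain("https://sub.example.com:8080/") # -> 'example.com'
+ ```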
+ """
+ try:
+ # Get domain from URL
+ domain = urlparse(url).netloc.lower()
+ if not domain:
+ return ""
+
+ # Remove port if present
+ domain = domain.split(':')[0]
+
+ # Remove www
+ domain = re.sub(r'^www\.', '', domain)
+
+ # Extract last two parts of domain (handles co.uk etc)
+ parts = domain.split('.')
+ if len(parts) > 2 and parts[-2] in {
+ 'co', 'com', 'org', 'gov', 'edu', 'net',
+ 'mil', 'int', 'ac', 'ad', 'ae', 'af', 'ag'
+ }:
+ return '.'.join(parts[-3:])
+
+ return '.'.join(parts[-2:])
+ except Exception:
+ return ""
+
+def is_external_url(url: str, base_domain: str) -> bool:
+ """
+ Check whether a URL is external to a given base domain.
+
+ How it works:
+ 1. Treats special schemes (mailto:, tel:, ftp:, file:, data:, javascript:) as external.
+ 2. Treats relative URLs (no network location) as internal.
+ 3. Compares the URL's domain with the base domain, ignoring a leading 'www.'.
+
+ Args:
+ url (str): The URL to check.
+ base_domain (str): The base domain to compare against.
+
+ Returns:
+ bool: True if the URL points outside the base domain, False otherwise.
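+
+ Example (illustrative):
+ ```python
+ is_external_url("https://other.org/page", "example.com") # -> True
+ is_external_url("https://docs.example.com/page", "example.com") # -> False
+ is_external_url("/relative/path", "example.com") # -> False
+ ```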
+ """
+ special = {'mailto:', 'tel:', 'ftp:', 'file:', 'data:', 'javascript:'}
+ if any(url.lower().startswith(p) for p in special):
+ return True
+
+ try:
+ parsed = urlparse(url)
+ if not parsed.netloc: # Relative URL
+ return False
+
+ # Strip 'www.' from both domains for comparison
+ url_domain = parsed.netloc.lower().replace('www.', '')
+ base = base_domain.lower().replace('www.', '')
+
+ # Check if URL domain ends with base domain
+ return not url_domain.endswith(base)
+ except Exception:
+ return False
+
+def clean_tokens(tokens: list[str]) -> list[str]:
+ """
+ Clean a list of tokens by removing noise, stop words, and short tokens.
+
+ How it works:
+ 1. Defines a set of noise words and stop words.
+ 2. Filters tokens based on length and exclusion criteria.
+ 3. Excludes tokens starting with certain symbols (e.g., "↑", "▲").
+
+ Args:
+ tokens (list[str]): The list of tokens to clean.
+
+ Returns:
+ list[str]: The cleaned list of tokens.
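+
+ Example (illustrative):
+ ```python
+ clean_tokens(["the", "crawler", "↑", "extracts", "of", "markdown"])
+ # -> ['crawler', 'extracts', 'markdown']
+ ```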
+ """
+
+ # Set of tokens to remove
+ noise = {'ccp', 'up', '↑', '▲', '⬆️', 'a', 'an', 'at', 'by', 'in', 'of', 'on', 'to', 'the'}
+
+ STOP_WORDS = {
+ 'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
+ 'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
+ 'to', 'was', 'were', 'will', 'with',
+
+ # Pronouns
+ 'i', 'you', 'he', 'she', 'it', 'we', 'they',
+ 'me', 'him', 'her', 'us', 'them',
+ 'my', 'your', 'his', 'her', 'its', 'our', 'their',
+ 'mine', 'yours', 'hers', 'ours', 'theirs',
+ 'myself', 'yourself', 'himself', 'herself', 'itself', 'ourselves', 'themselves',
+
+ # Common verbs
+ 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
+ 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
+
+ # Prepositions
+ 'about', 'above', 'across', 'after', 'against', 'along', 'among', 'around',
+ 'at', 'before', 'behind', 'below', 'beneath', 'beside', 'between', 'beyond',
+ 'by', 'down', 'during', 'except', 'for', 'from', 'in', 'inside', 'into',
+ 'near', 'of', 'off', 'on', 'out', 'outside', 'over', 'past', 'through',
+ 'to', 'toward', 'under', 'underneath', 'until', 'up', 'upon', 'with', 'within',
+
+ # Conjunctions
+ 'and', 'but', 'or', 'nor', 'for', 'yet', 'so',
+ 'although', 'because', 'since', 'unless',
+
+ # Articles
+ 'a', 'an', 'the',
+
+ # Other common words
+ 'this', 'that', 'these', 'those',
+ 'what', 'which', 'who', 'whom', 'whose',
+ 'when', 'where', 'why', 'how',
+ 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
+ 'can', 'cannot', "can't", 'could', "couldn't",
+ 'may', 'might', 'must', "mustn't",
+ 'shall', 'should', "shouldn't",
+ 'will', "won't", 'would', "wouldn't",
+ 'not', "n't", 'no', 'nor', 'none'
+ }
+
+ # Single comprehension, more efficient than multiple passes
+ return [token for token in tokens
+ if len(token) > 2
+ and token not in noise
+ and token not in STOP_WORDS
+ and not token.startswith('↑')
+ and not token.startswith('▲')
+ and not token.startswith('⬆')]
+
+def profile_and_time(func):
+ """
+ Decorator to profile a function's execution time and performance.
+
+ How it works:
+ 1. Records the start time before executing the function.
+ 2. Profiles the function's execution using `cProfile`.
+ 3. Prints the elapsed time and profiling statistics.
+
+ Args:
+ func (Callable): The function to decorate.
+
+ Returns:
+ Callable: The decorated function with profiling and timing enabled.
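+
+ Example (illustrative sketch; the decorator is written for instance methods, but any callable whose first positional argument stands in for `self` works):
+ ```python
+ @profile_and_time
+ def parse_page(html):
+ return len(html)
+
+ parse_page("<html></html>") # prints elapsed time and the top cProfile entries
+ ```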
+ """
+
+ @wraps(func)
+ def wrapper(self, *args, **kwargs):
+ # Start timer
+ start_time = time.perf_counter()
+
+ # Setup profiler
+ profiler = cProfile.Profile()
+ profiler.enable()
+
+ # Run function
+ result = func(self, *args, **kwargs)
+
+ # Stop profiler
+ profiler.disable()
+
+ # Calculate elapsed time
+ elapsed_time = time.perf_counter() - start_time
+
+ # Print timing
+ print(f"[PROFILER] Scraping completed in {elapsed_time:.2f} seconds")
+
+ # Print profiling stats
+ stats = pstats.Stats(profiler)
+ stats.sort_stats('cumulative') # Sort by cumulative time
+ stats.print_stats(20) # Print top 20 time-consuming functions
+
+ return result
+ return wrapper
+
+def generate_content_hash(content: str) -> str:
+ """Generate a unique hash for content"""
+ return xxhash.xxh64(content.encode()).hexdigest()
+ # return hashlib.sha256(content.encode()).hexdigest()
+
+def ensure_content_dirs(base_path: str) -> Dict[str, str]:
+ """Create content directories if they don't exist"""
+ dirs = {
+ 'html': 'html_content',
+ 'cleaned': 'cleaned_html',
+ 'markdown': 'markdown_content',
+ 'extracted': 'extracted_content',
+ 'screenshots': 'screenshots',
+ 'screenshot': 'screenshots'
+ }
+
+ content_paths = {}
+ for key, dirname in dirs.items():
+ path = os.path.join(base_path, dirname)
+ os.makedirs(path, exist_ok=True)
+ content_paths[key] = path
+
+ return content_paths
+
+def configure_windows_event_loop():
+ """
+ Configure the Windows event loop to use ProactorEventLoop.
+ This resolves the NotImplementedError that occurs on Windows when using asyncio subprocesses.
+
+ This function should only be called on Windows systems and before any async operations.
+ On non-Windows systems, this function does nothing.
+
+ Example:
+ ```python
+ from crawl4ai.async_configs import configure_windows_event_loop
+
+ # Call this before any async operations if you're on Windows
+ configure_windows_event_loop()
+ ```
+ """
+ if platform.system() == 'Windows':
+ asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
+
+def get_error_context(exc_info, context_lines: int = 5):
+ """
+ Extract error context with more reliable line number tracking.
+
+ Args:
+ exc_info: The exception info from sys.exc_info()
+ context_lines: Number of lines to show before and after the error
+
+ Returns:
+ dict: Error context information
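+
+ Example (illustrative sketch):
+ ```python
+ import sys
+
+ try:
+ 1 / 0
+ except ZeroDivisionError:
+ ctx = get_error_context(sys.exc_info())
+ print(ctx["filename"], ctx["line_no"], ctx["function"])
+ ```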
+ """
+ import traceback
+ import linecache
+ import os
+
+ # Get the full traceback
+ tb = traceback.extract_tb(exc_info[2])
+
+ # Get the last frame (where the error occurred)
+ last_frame = tb[-1]
+ filename = last_frame.filename
+ line_no = last_frame.lineno
+ func_name = last_frame.name
+
+ # Get the source code context using linecache
+ # This is more reliable than inspect.getsourcelines
+ context_start = max(1, line_no - context_lines)
+ context_end = line_no + context_lines + 1
+
+ # Build the context lines with line numbers
+ numbered_lines = []
+ for i in range(context_start, context_end):
+ line = linecache.getline(filename, i)
+ if line:
+ # Strip trailing whitespace/newlines and add a pointer for the error line
+ line = line.rstrip()
+ pointer = '→' if i == line_no else ' '
+ numbered_lines.append(f"{i:4d} {pointer} {line}")
+
+ # Join the lines with newlines
+ code_context = '\n'.join(numbered_lines)
+
+ # Get relative path for cleaner output
+ try:
+ rel_path = os.path.relpath(filename)
+ except ValueError:
+ # Fallback if relpath fails (can happen on Windows with different drives)
+ rel_path = filename
+
+ return {
+ "filename": rel_path,
+ "line_no": line_no,
+ "function": func_name,
+ "code_context": code_context
+ }
+
+
+
\ No newline at end of file
diff --git a/crawl4ai/utils.scraping.py b/crawl4ai/utils.scraping.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/crawl4ai/version_manager.py b/crawl4ai/version_manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ae2de2e937fae526351b8dfd586567d21bc7be3
--- /dev/null
+++ b/crawl4ai/version_manager.py
@@ -0,0 +1,30 @@
+# version_manager.py
+import os
+from pathlib import Path
+from packaging import version
+from . import __version__
+
+class VersionManager:
+ def __init__(self):
+ self.home_dir = Path.home() / ".crawl4ai"
+ self.version_file = self.home_dir / "version.txt"
+
+ def get_installed_version(self):
+ """Get the version recorded in home directory"""
+ if not self.version_file.exists():
+ return None
+ try:
+ return version.parse(self.version_file.read_text().strip())
+ except Exception:
+ return None
+
+ def update_version(self):
+ """Update the version file to current library version"""
+ self.version_file.write_text(__version__.__version__)
+
+ def needs_update(self):
+ """Check if database needs update based on version"""
+ installed = self.get_installed_version()
+ current = version.parse(__version__.__version__)
+ return installed is None or installed < current
+
diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32a988d70767397102c054a95391fc00d0f8145
--- /dev/null
+++ b/crawl4ai/web_crawler.py
@@ -0,0 +1,253 @@
+import os, time
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+from pathlib import Path
+
+from .models import UrlModel, CrawlResult
+from .database import init_db, get_cached_url, cache_url, DB_PATH, flush_db
+from .utils import *
+from .chunking_strategy import *
+from .extraction_strategy import *
+from .crawler_strategy import *
+from typing import List
+from concurrent.futures import ThreadPoolExecutor
+from .content_scraping_strategy import WebScrapingStrategy
+from .config import *
+import warnings
+import json
+warnings.filterwarnings("ignore", message='Field "model_name" has conflict with protected namespace "model_".')
+
+
+class WebCrawler:
+ def __init__(self, crawler_strategy: CrawlerStrategy = None, always_by_pass_cache: bool = False, verbose: bool = False):
+ self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose)
+ self.always_by_pass_cache = always_by_pass_cache
+ self.crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
+ os.makedirs(self.crawl4ai_folder, exist_ok=True)
+ os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
+ init_db()
+ self.ready = False
+
+ def warmup(self):
+ print("[LOG] 🌤️ Warming up the WebCrawler")
+ self.run(
+ url='https://google.com/',
+ word_count_threshold=5,
+ extraction_strategy=NoExtractionStrategy(),
+ bypass_cache=False,
+ verbose=False
+ )
+ self.ready = True
+ print("[LOG] 🌞 WebCrawler is ready to crawl")
+
+ def fetch_page(
+ self,
+ url_model: UrlModel,
+ provider: str = DEFAULT_PROVIDER,
+ api_token: str = None,
+ extract_blocks_flag: bool = True,
+ word_count_threshold=MIN_WORD_THRESHOLD,
+ css_selector: str = None,
+ screenshot: bool = False,
+ use_cached_html: bool = False,
+ extraction_strategy: ExtractionStrategy = None,
+ chunking_strategy: ChunkingStrategy = RegexChunking(),
+ **kwargs,
+ ) -> CrawlResult:
+ return self.run(
+ url_model.url,
+ word_count_threshold,
+ extraction_strategy or NoExtractionStrategy(),
+ chunking_strategy,
+ bypass_cache=url_model.forced,
+ css_selector=css_selector,
+ screenshot=screenshot,
+ **kwargs,
+ )
+
+ def fetch_pages(
+ self,
+ url_models: List[UrlModel],
+ provider: str = DEFAULT_PROVIDER,
+ api_token: str = None,
+ extract_blocks_flag: bool = True,
+ word_count_threshold=MIN_WORD_THRESHOLD,
+ use_cached_html: bool = False,
+ css_selector: str = None,
+ screenshot: bool = False,
+ extraction_strategy: ExtractionStrategy = None,
+ chunking_strategy: ChunkingStrategy = RegexChunking(),
+ **kwargs,
+ ) -> List[CrawlResult]:
+ extraction_strategy = extraction_strategy or NoExtractionStrategy()
+ def fetch_page_wrapper(url_model, *args, **kwargs):
+ return self.fetch_page(url_model, *args, **kwargs)
+
+ with ThreadPoolExecutor() as executor:
+ results = list(
+ executor.map(
+ fetch_page_wrapper,
+ url_models,
+ [provider] * len(url_models),
+ [api_token] * len(url_models),
+ [extract_blocks_flag] * len(url_models),
+ [word_count_threshold] * len(url_models),
+ [css_selector] * len(url_models),
+ [screenshot] * len(url_models),
+ [use_cached_html] * len(url_models),
+ [extraction_strategy] * len(url_models),
+ [chunking_strategy] * len(url_models),
+ *[kwargs] * len(url_models),
+ )
+ )
+
+ return results
+
+ def run(
+ self,
+ url: str,
+ word_count_threshold=MIN_WORD_THRESHOLD,
+ extraction_strategy: ExtractionStrategy = None,
+ chunking_strategy: ChunkingStrategy = RegexChunking(),
+ bypass_cache: bool = False,
+ css_selector: str = None,
+ screenshot: bool = False,
+ user_agent: str = None,
+ verbose=True,
+ **kwargs,
+ ) -> CrawlResult:
+ try:
+ extraction_strategy = extraction_strategy or NoExtractionStrategy()
+ extraction_strategy.verbose = verbose
+ if not isinstance(extraction_strategy, ExtractionStrategy):
+ raise ValueError("Unsupported extraction strategy")
+ if not isinstance(chunking_strategy, ChunkingStrategy):
+ raise ValueError("Unsupported chunking strategy")
+
+ word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD)
+
+ cached = None
+ screenshot_data = None
+ extracted_content = None
+ if not bypass_cache and not self.always_by_pass_cache:
+ cached = get_cached_url(url)
+
+ if kwargs.get("warmup", True) and not self.ready:
+ return None
+
+ if cached:
+ html = sanitize_input_encode(cached[1])
+ extracted_content = sanitize_input_encode(cached[4])
+ if screenshot:
+ screenshot_data = cached[9]
+ if not screenshot_data:
+ cached = None
+
+ if not cached or not html:
+ if user_agent:
+ self.crawler_strategy.update_user_agent(user_agent)
+ t1 = time.time()
+ html = sanitize_input_encode(self.crawler_strategy.crawl(url, **kwargs))
+ t2 = time.time()
+ if verbose:
+ print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds")
+ if screenshot:
+ screenshot_data = self.crawler_strategy.take_screenshot()
+
+
+ crawl_result = self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs)
+ crawl_result.success = bool(html)
+ return crawl_result
+ except Exception as e:
+ if not hasattr(e, "msg"):
+ e.msg = str(e)
+ print(f"[ERROR] 🚫 Failed to crawl {url}, error: {e.msg}")
+ return CrawlResult(url=url, html="", success=False, error_message=e.msg)
+
+ def process_html(
+ self,
+ url: str,
+ html: str,
+ extracted_content: str,
+ word_count_threshold: int,
+ extraction_strategy: ExtractionStrategy,
+ chunking_strategy: ChunkingStrategy,
+ css_selector: str,
+ screenshot: bool,
+ verbose: bool,
+ is_cached: bool,
+ **kwargs,
+ ) -> CrawlResult:
+ t = time.time()
+ # Extract content from HTML
+ try:
+ t1 = time.time()
+ scrapping_strategy = WebScrapingStrategy()
+ extra_params = {k: v for k, v in kwargs.items() if k not in ["only_text", "image_description_min_word_threshold"]}
+ result = scrapping_strategy.scrap(
+ url,
+ html,
+ word_count_threshold=word_count_threshold,
+ css_selector=css_selector,
+ only_text=kwargs.get("only_text", False),
+ image_description_min_word_threshold=kwargs.get(
+ "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
+ ),
+ **extra_params,
+ )
+
+ # result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
+ if verbose:
+ print(f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds")
+
+ if result is None:
+ raise ValueError(f"Failed to extract content from the website: {url}")
+ except InvalidCSSSelectorError as e:
+ raise ValueError(str(e))
+
+ cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
+ markdown = sanitize_input_encode(result.get("markdown", ""))
+ media = result.get("media", [])
+ links = result.get("links", [])
+ metadata = result.get("metadata", {})
+
+ if extracted_content is None:
+ if verbose:
+ print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}")
+
+ sections = chunking_strategy.chunk(markdown)
+ extracted_content = extraction_strategy.run(url, sections)
+ extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)
+
+ if verbose:
+ print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t:.2f} seconds.")
+
+ screenshot = None if not screenshot else screenshot
+
+ if not is_cached:
+ cache_url(
+ url,
+ html,
+ cleaned_html,
+ markdown,
+ extracted_content,
+ True,
+ json.dumps(media),
+ json.dumps(links),
+ json.dumps(metadata),
+ screenshot=screenshot,
+ )
+
+ return CrawlResult(
+ url=url,
+ html=html,
+ cleaned_html=format_html(cleaned_html),
+ markdown=markdown,
+ media=media,
+ links=links,
+ metadata=metadata,
+ screenshot=screenshot,
+ extracted_content=extracted_content,
+ success=True,
+ error_message="",
+ )
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000000000000000000000000000000000000..4b22fd9846cc0185ffe281e85ae4378538de282f
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,67 @@
+services:
+ # Local build services for different platforms
+ crawl4ai-amd64:
+ build:
+ context: .
+ dockerfile: Dockerfile
+ args:
+ PYTHON_VERSION: "3.10"
+ INSTALL_TYPE: ${INSTALL_TYPE:-basic}
+ ENABLE_GPU: false
+ platforms:
+ - linux/amd64
+ profiles: ["local-amd64"]
+ extends: &base-config
+ file: docker-compose.yml
+ service: base-config
+
+ crawl4ai-arm64:
+ build:
+ context: .
+ dockerfile: Dockerfile
+ args:
+ PYTHON_VERSION: "3.10"
+ INSTALL_TYPE: ${INSTALL_TYPE:-basic}
+ ENABLE_GPU: false
+ platforms:
+ - linux/arm64
+ profiles: ["local-arm64"]
+ extends: *base-config
+
+ # Hub services for different platforms and versions
+ crawl4ai-hub-amd64:
+ image: unclecode/crawl4ai:${VERSION:-basic}-amd64
+ profiles: ["hub-amd64"]
+ extends: *base-config
+
+ crawl4ai-hub-arm64:
+ image: unclecode/crawl4ai:${VERSION:-basic}-arm64
+ profiles: ["hub-arm64"]
+ extends: *base-config
+
+ # Base configuration to be extended
+ base-config:
+ ports:
+ - "11235:11235"
+ - "8000:8000"
+ - "9222:9222"
+ - "8080:8080"
+ environment:
+ - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-}
+ - OPENAI_API_KEY=${OPENAI_API_KEY:-}
+ - CLAUDE_API_KEY=${CLAUDE_API_KEY:-}
+ volumes:
+ - /dev/shm:/dev/shm
+ deploy:
+ resources:
+ limits:
+ memory: 4G
+ reservations:
+ memory: 1G
+ restart: unless-stopped
+ healthcheck:
+ test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
+ interval: 30s
+ timeout: 10s
+ retries: 3
+ start_period: 40s
\ No newline at end of file
diff --git a/docs/assets/pitch-dark.png b/docs/assets/pitch-dark.png
new file mode 100644
index 0000000000000000000000000000000000000000..9b9b37b5dc2d7645a267b733a0d2b4916f5af203
Binary files /dev/null and b/docs/assets/pitch-dark.png differ
diff --git a/docs/assets/pitch-dark.svg b/docs/assets/pitch-dark.svg
new file mode 100644
index 0000000000000000000000000000000000000000..0913b2dad5f90ff96e73cd690ff315d0adb675d2
--- /dev/null
+++ b/docs/assets/pitch-dark.svg
@@ -0,0 +1,64 @@
+[pitch-dark.svg: vector graphic; recoverable text content follows]
+Data Capitalization Opportunity: Transform digital footprints into assets; Personal data as capital; Enterprise knowledge valuation; New form of wealth creation
+Authentic Data Potential: Vast reservoir of real insights; Enhanced AI development; Diverse human knowledge; Willing participation model
+1. Open-Source Foundation: Data extraction engine & community development
+2. Data Capitalization Platform: Tools to structure & value digital assets
+3. Shared Data Marketplace: Economic platform for data exchange
+Economic Vision: Shared Data Economy
\ No newline at end of file
diff --git a/docs/deprecated/docker-deployment.md b/docs/deprecated/docker-deployment.md
new file mode 100644
index 0000000000000000000000000000000000000000..db8446e324b4f76a8b919e39662ba2f5cb7e52c5
--- /dev/null
+++ b/docs/deprecated/docker-deployment.md
@@ -0,0 +1,189 @@
+# 🐳 Using Docker (Legacy)
+
+Crawl4AI is available as Docker images for easy deployment. You can either pull directly from Docker Hub (recommended) or build from the repository.
+
+---
+
+## 🐳 Option 1: Docker Hub (Recommended)
+
+Choose the appropriate image based on your platform and needs:
+
+### For AMD64 (Regular Linux/Windows):
+```bash
+# Basic version (recommended)
+docker pull unclecode/crawl4ai:basic-amd64
+docker run -p 11235:11235 unclecode/crawl4ai:basic-amd64
+
+# Full ML/LLM support
+docker pull unclecode/crawl4ai:all-amd64
+docker run -p 11235:11235 unclecode/crawl4ai:all-amd64
+
+# With GPU support
+docker pull unclecode/crawl4ai:gpu-amd64
+docker run -p 11235:11235 unclecode/crawl4ai:gpu-amd64
+```
+
+### For ARM64 (M1/M2 Macs, ARM servers):
+```bash
+# Basic version (recommended)
+docker pull unclecode/crawl4ai:basic-arm64
+docker run -p 11235:11235 unclecode/crawl4ai:basic-arm64
+
+# Full ML/LLM support
+docker pull unclecode/crawl4ai:all-arm64
+docker run -p 11235:11235 unclecode/crawl4ai:all-arm64
+
+# With GPU support
+docker pull unclecode/crawl4ai:gpu-arm64
+docker run -p 11235:11235 unclecode/crawl4ai:gpu-arm64
+```
+
+Need more memory? Add `--shm-size`:
+```bash
+docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic-amd64
+```
+
+Test the installation:
+```bash
+curl http://localhost:11235/health
+```
+
+### For Raspberry Pi (32-bit) (coming soon):
+```bash
+# Pull and run basic version (recommended for Raspberry Pi)
+docker pull unclecode/crawl4ai:basic-armv7
+docker run -p 11235:11235 unclecode/crawl4ai:basic-armv7
+
+# With increased shared memory if needed
+docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic-armv7
+```
+
+Note: Due to hardware constraints, only the basic version is recommended for Raspberry Pi.
+
+## 🐳 Option 2: Build from Repository
+
+Build the image locally based on your platform:
+
+```bash
+# Clone the repository
+git clone https://github.com/unclecode/crawl4ai.git
+cd crawl4ai
+
+# For AMD64 (Regular Linux/Windows)
+docker build --platform linux/amd64 \
+ --tag crawl4ai:local \
+ --build-arg INSTALL_TYPE=basic \
+ .
+
+# For ARM64 (M1/M2 Macs, ARM servers)
+docker build --platform linux/arm64 \
+ --tag crawl4ai:local \
+ --build-arg INSTALL_TYPE=basic \
+ .
+```
+
+Build options:
+- INSTALL_TYPE=basic (default): Basic crawling features
+- INSTALL_TYPE=all: Full ML/LLM support
+- ENABLE_GPU=true: Add GPU support
+
+Example with all options:
+```bash
+docker build --platform linux/amd64 \
+ --tag crawl4ai:local \
+ --build-arg INSTALL_TYPE=all \
+ --build-arg ENABLE_GPU=true \
+ .
+```
+
+Run your local build:
+```bash
+# Regular run
+docker run -p 11235:11235 crawl4ai:local
+
+# With increased shared memory
+docker run --shm-size=2gb -p 11235:11235 crawl4ai:local
+```
+
+Test the installation:
+```bash
+curl http://localhost:11235/health
+```
+
+## 🐳 Option 3: Using Docker Compose
+
+Docker Compose provides a more structured way to run Crawl4AI, especially when dealing with environment variables and multiple configurations.
+
+```bash
+# Clone the repository
+git clone https://github.com/unclecode/crawl4ai.git
+cd crawl4ai
+```
+
+### For AMD64 (Regular Linux/Windows):
+```bash
+# Build and run locally
+docker-compose --profile local-amd64 up
+
+# Run from Docker Hub
+VERSION=basic docker-compose --profile hub-amd64 up # Basic version
+VERSION=all docker-compose --profile hub-amd64 up # Full ML/LLM support
+VERSION=gpu docker-compose --profile hub-amd64 up # GPU support
+```
+
+### For ARM64 (M1/M2 Macs, ARM servers):
+```bash
+# Build and run locally
+docker-compose --profile local-arm64 up
+
+# Run from Docker Hub
+VERSION=basic docker-compose --profile hub-arm64 up # Basic version
+VERSION=all docker-compose --profile hub-arm64 up # Full ML/LLM support
+VERSION=gpu docker-compose --profile hub-arm64 up # GPU support
+```
+
+Environment variables (optional):
+```bash
+# Create a .env file
+CRAWL4AI_API_TOKEN=your_token
+OPENAI_API_KEY=your_openai_key
+CLAUDE_API_KEY=your_claude_key
+```
+
+The compose file includes the following (see the commands below the list for inspecting and monitoring the stack):
+- Memory management (4GB limit, 1GB reserved)
+- Shared memory volume for browser support
+- Health checks
+- Auto-restart policy
+- All necessary port mappings
+
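+Before bringing a profile up, you can ask Compose to render exactly what it will run, and once it is running you can check the container state and health. This is a minimal sketch assuming a Compose release with profile support (1.28 or newer):
+
+```bash
+# Render the fully merged configuration for a profile without starting anything
+docker-compose --profile hub-amd64 config
+
+# Start in the background, then check container state and the health check
+docker-compose --profile hub-amd64 up -d
+docker-compose --profile hub-amd64 ps
+```
+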
+Test the installation:
+```bash
+curl http://localhost:11235/health
+```
+
+## 🚀 One-Click Deployment
+
+Deploy your own instance of Crawl4AI with one click:
+
+[![DigitalOcean Referral Badge](https://web-platforms.sfo2.cdn.digitaloceanspaces.com/WWW/Badge%203.svg)](https://www.digitalocean.com/?repo=https://github.com/unclecode/crawl4ai/tree/0.3.74&refcode=a0780f1bdb3d&utm_campaign=Referral_Invite&utm_medium=Referral_Program&utm_source=badge)
+
+> 💡 **Recommended specs**: 4GB RAM minimum. Select "professional-xs" or higher when deploying for stable operation.
+
+The deployment will (a quick way to verify the running instance is shown below):
+- Set up a Docker container with Crawl4AI
+- Configure Playwright and all dependencies
+- Start the FastAPI server on port `11235`
+- Set up health checks and auto-deployment
+
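+Once the droplet is up, you can sanity-check the API from your machine. The commands below mirror the `docker_example.py` test client in this repository; `YOUR_DROPLET_IP` and `YOUR_TASK_ID` are placeholders, and the `Authorization` header is only needed if you set `CRAWL4AI_API_TOKEN`:
+
+```bash
+# Health check
+curl http://YOUR_DROPLET_IP:11235/health
+
+# Submit a crawl job and note the returned task_id
+curl -X POST http://YOUR_DROPLET_IP:11235/crawl \
+  -H "Authorization: Bearer $CRAWL4AI_API_TOKEN" \
+  -H "Content-Type: application/json" \
+  -d '{"urls": "https://www.nbcnews.com/business", "priority": 10}'
+
+# Poll the task until its status is "completed"
+curl -H "Authorization: Bearer $CRAWL4AI_API_TOKEN" \
+  http://YOUR_DROPLET_IP:11235/task/YOUR_TASK_ID
+```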
+
diff --git a/docs/examples/amazon_product_extraction_direct_url.py b/docs/examples/amazon_product_extraction_direct_url.py
new file mode 100644
index 0000000000000000000000000000000000000000..769c479e3f040e74c08dbdcc586fcaaddb92d4a4
--- /dev/null
+++ b/docs/examples/amazon_product_extraction_direct_url.py
@@ -0,0 +1,114 @@
+"""
+This example demonstrates how to use JSON CSS extraction to scrape product information
+from Amazon search results. It shows how to extract structured data like product titles,
+prices, ratings, and other details using CSS selectors.
+"""
+
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+import json
+
+async def extract_amazon_products():
+ # Initialize browser config
+ browser_config = BrowserConfig(
+ browser_type="chromium",
+ headless=True
+ )
+
+ # Initialize crawler config with JSON CSS extraction strategy
+ crawler_config = CrawlerRunConfig(
+ extraction_strategy=JsonCssExtractionStrategy(
+ schema={
+ "name": "Amazon Product Search Results",
+ "baseSelector": "[data-component-type='s-search-result']",
+ "fields": [
+ {
+ "name": "asin",
+ "selector": "",
+ "type": "attribute",
+ "attribute": "data-asin"
+ },
+ {
+ "name": "title",
+ "selector": "h2 a span",
+ "type": "text"
+ },
+ {
+ "name": "url",
+ "selector": "h2 a",
+ "type": "attribute",
+ "attribute": "href"
+ },
+ {
+ "name": "image",
+ "selector": ".s-image",
+ "type": "attribute",
+ "attribute": "src"
+ },
+ {
+ "name": "rating",
+ "selector": ".a-icon-star-small .a-icon-alt",
+ "type": "text"
+ },
+ {
+ "name": "reviews_count",
+ "selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span",
+ "type": "text"
+ },
+ {
+ "name": "price",
+ "selector": ".a-price .a-offscreen",
+ "type": "text"
+ },
+ {
+ "name": "original_price",
+ "selector": ".a-price.a-text-price .a-offscreen",
+ "type": "text"
+ },
+ {
+ "name": "sponsored",
+ "selector": ".puis-sponsored-label-text",
+ "type": "exists"
+ },
+ {
+ "name": "delivery_info",
+ "selector": "[data-cy='delivery-recipe'] .a-color-base",
+ "type": "text",
+ "multiple": True
+ }
+ ]
+ }
+ )
+ )
+
+ # Example search URL (you should replace with your actual Amazon URL)
+ url = "https://www.amazon.com/s?k=Samsung+Galaxy+Tab"
+
+ # Use context manager for proper resource handling
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ # Extract the data
+ result = await crawler.arun(url=url, config=crawler_config)
+
+ # Process and print the results
+ if result and result.extracted_content:
+ # Parse the JSON string into a list of products
+ products = json.loads(result.extracted_content)
+
+ # Process each product in the list
+ for product in products:
+ print("\nProduct Details:")
+ print(f"ASIN: {product.get('asin')}")
+ print(f"Title: {product.get('title')}")
+ print(f"Price: {product.get('price')}")
+ print(f"Original Price: {product.get('original_price')}")
+ print(f"Rating: {product.get('rating')}")
+ print(f"Reviews: {product.get('reviews_count')}")
+ print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}")
+ if product.get('delivery_info'):
+ print(f"Delivery: {' '.join(product['delivery_info'])}")
+ print("-" * 80)
+
+if __name__ == "__main__":
+ import asyncio
+ asyncio.run(extract_amazon_products())
diff --git a/docs/examples/amazon_product_extraction_using_hooks.py b/docs/examples/amazon_product_extraction_using_hooks.py
new file mode 100644
index 0000000000000000000000000000000000000000..a17d60c5944101364897a4eaf620fef634d96e27
--- /dev/null
+++ b/docs/examples/amazon_product_extraction_using_hooks.py
@@ -0,0 +1,145 @@
+"""
+This example demonstrates how to use JSON CSS extraction to scrape product information
+from Amazon search results. It shows how to extract structured data like product titles,
+prices, ratings, and other details using CSS selectors.
+"""
+
+from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+import json
+from playwright.async_api import Page, BrowserContext
+
+async def extract_amazon_products():
+ # Initialize browser config
+ browser_config = BrowserConfig(
+ # browser_type="chromium",
+ headless=True
+ )
+
+ # Initialize crawler config with JSON CSS extraction strategy
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+
+ extraction_strategy=JsonCssExtractionStrategy(
+ schema={
+ "name": "Amazon Product Search Results",
+ "baseSelector": "[data-component-type='s-search-result']",
+ "fields": [
+ {
+ "name": "asin",
+ "selector": "",
+ "type": "attribute",
+ "attribute": "data-asin"
+ },
+ {
+ "name": "title",
+ "selector": "h2 a span",
+ "type": "text"
+ },
+ {
+ "name": "url",
+ "selector": "h2 a",
+ "type": "attribute",
+ "attribute": "href"
+ },
+ {
+ "name": "image",
+ "selector": ".s-image",
+ "type": "attribute",
+ "attribute": "src"
+ },
+ {
+ "name": "rating",
+ "selector": ".a-icon-star-small .a-icon-alt",
+ "type": "text"
+ },
+ {
+ "name": "reviews_count",
+ "selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span",
+ "type": "text"
+ },
+ {
+ "name": "price",
+ "selector": ".a-price .a-offscreen",
+ "type": "text"
+ },
+ {
+ "name": "original_price",
+ "selector": ".a-price.a-text-price .a-offscreen",
+ "type": "text"
+ },
+ {
+ "name": "sponsored",
+ "selector": ".puis-sponsored-label-text",
+ "type": "exists"
+ },
+ {
+ "name": "delivery_info",
+ "selector": "[data-cy='delivery-recipe'] .a-color-base",
+ "type": "text",
+ "multiple": True
+ }
+ ]
+ }
+ )
+ )
+
+ url = "https://www.amazon.com/"
+
+ async def after_goto(page: Page, context: BrowserContext, url: str, response: dict, **kwargs):
+ """Hook called after navigating to each URL"""
+ print(f"[HOOK] after_goto - Successfully loaded: {url}")
+
+ try:
+ # Wait for search box to be available
+ search_box = await page.wait_for_selector('#twotabsearchtextbox', timeout=1000)
+
+ # Type the search query
+ await search_box.fill('Samsung Galaxy Tab')
+
+ # Get the search button and prepare for navigation
+ search_button = await page.wait_for_selector('#nav-search-submit-button', timeout=1000)
+
+ # Click with navigation waiting
+ await search_button.click()
+
+ # Wait for search results to load
+ await page.wait_for_selector('[data-component-type="s-search-result"]', timeout=10000)
+ print("[HOOK] Search completed and results loaded!")
+
+ except Exception as e:
+ print(f"[HOOK] Error during search operation: {str(e)}")
+
+ return page
+
+ # Use context manager for proper resource handling
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+
+ crawler.crawler_strategy.set_hook("after_goto", after_goto)
+
+ # Extract the data
+ result = await crawler.arun(url=url, config=crawler_config)
+
+ # Process and print the results
+ if result and result.extracted_content:
+ # Parse the JSON string into a list of products
+ products = json.loads(result.extracted_content)
+
+ # Process each product in the list
+ for product in products:
+ print("\nProduct Details:")
+ print(f"ASIN: {product.get('asin')}")
+ print(f"Title: {product.get('title')}")
+ print(f"Price: {product.get('price')}")
+ print(f"Original Price: {product.get('original_price')}")
+ print(f"Rating: {product.get('rating')}")
+ print(f"Reviews: {product.get('reviews_count')}")
+ print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}")
+ if product.get('delivery_info'):
+ print(f"Delivery: {' '.join(product['delivery_info'])}")
+ print("-" * 80)
+
+if __name__ == "__main__":
+ import asyncio
+ asyncio.run(extract_amazon_products())
diff --git a/docs/examples/amazon_product_extraction_using_use_javascript.py b/docs/examples/amazon_product_extraction_using_use_javascript.py
new file mode 100644
index 0000000000000000000000000000000000000000..15e5d6f59a726852d38357a3ab16d523d3a108de
--- /dev/null
+++ b/docs/examples/amazon_product_extraction_using_use_javascript.py
@@ -0,0 +1,129 @@
+"""
+This example demonstrates how to use JSON CSS extraction to scrape product information
+from Amazon search results. It shows how to extract structured data like product titles,
+prices, ratings, and other details using CSS selectors.
+"""
+
+from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+import json
+from playwright.async_api import Page, BrowserContext
+
+async def extract_amazon_products():
+ # Initialize browser config
+ browser_config = BrowserConfig(
+ # browser_type="chromium",
+ headless=True
+ )
+
+ js_code_to_search = """
+ const task = async () => {
+ document.querySelector('#twotabsearchtextbox').value = 'Samsung Galaxy Tab';
+ document.querySelector('#nav-search-submit-button').click();
+ }
+ await task();
+ """
+ js_code_to_search_sync = """
+ document.querySelector('#twotabsearchtextbox').value = 'Samsung Galaxy Tab';
+ document.querySelector('#nav-search-submit-button').click();
+ """
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ js_code = js_code_to_search,
+ wait_for='css:[data-component-type="s-search-result"]',
+ extraction_strategy=JsonCssExtractionStrategy(
+ schema={
+ "name": "Amazon Product Search Results",
+ "baseSelector": "[data-component-type='s-search-result']",
+ "fields": [
+ {
+ "name": "asin",
+ "selector": "",
+ "type": "attribute",
+ "attribute": "data-asin"
+ },
+ {
+ "name": "title",
+ "selector": "h2 a span",
+ "type": "text"
+ },
+ {
+ "name": "url",
+ "selector": "h2 a",
+ "type": "attribute",
+ "attribute": "href"
+ },
+ {
+ "name": "image",
+ "selector": ".s-image",
+ "type": "attribute",
+ "attribute": "src"
+ },
+ {
+ "name": "rating",
+ "selector": ".a-icon-star-small .a-icon-alt",
+ "type": "text"
+ },
+ {
+ "name": "reviews_count",
+ "selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span",
+ "type": "text"
+ },
+ {
+ "name": "price",
+ "selector": ".a-price .a-offscreen",
+ "type": "text"
+ },
+ {
+ "name": "original_price",
+ "selector": ".a-price.a-text-price .a-offscreen",
+ "type": "text"
+ },
+ {
+ "name": "sponsored",
+ "selector": ".puis-sponsored-label-text",
+ "type": "exists"
+ },
+ {
+ "name": "delivery_info",
+ "selector": "[data-cy='delivery-recipe'] .a-color-base",
+ "type": "text",
+ "multiple": True
+ }
+ ]
+ }
+ )
+ )
+
+ # Example search URL (you should replace with your actual Amazon URL)
+ url = "https://www.amazon.com/"
+
+
+ # Use context manager for proper resource handling
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ # Extract the data
+ result = await crawler.arun(url=url, config=crawler_config)
+
+ # Process and print the results
+ if result and result.extracted_content:
+ # Parse the JSON string into a list of products
+ products = json.loads(result.extracted_content)
+
+ # Process each product in the list
+ for product in products:
+ print("\nProduct Details:")
+ print(f"ASIN: {product.get('asin')}")
+ print(f"Title: {product.get('title')}")
+ print(f"Price: {product.get('price')}")
+ print(f"Original Price: {product.get('original_price')}")
+ print(f"Rating: {product.get('rating')}")
+ print(f"Reviews: {product.get('reviews_count')}")
+ print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}")
+ if product.get('delivery_info'):
+ print(f"Delivery: {' '.join(product['delivery_info'])}")
+ print("-" * 80)
+
+if __name__ == "__main__":
+ import asyncio
+ asyncio.run(extract_amazon_products())
diff --git a/docs/examples/assets/audio.mp3 b/docs/examples/assets/audio.mp3
new file mode 100644
index 0000000000000000000000000000000000000000..299149c6dec722f2a7274c2894bdde12d31107ef
Binary files /dev/null and b/docs/examples/assets/audio.mp3 differ
diff --git a/docs/examples/assets/basic.png b/docs/examples/assets/basic.png
new file mode 100644
index 0000000000000000000000000000000000000000..ea68852bb48ff94699e35e93becb045776d0085f
Binary files /dev/null and b/docs/examples/assets/basic.png differ
diff --git a/docs/examples/assets/cosine_extraction.png b/docs/examples/assets/cosine_extraction.png
new file mode 100644
index 0000000000000000000000000000000000000000..19252ad44da8dcefd409ce643e1f46f20e8d15a3
Binary files /dev/null and b/docs/examples/assets/cosine_extraction.png differ
diff --git a/docs/examples/assets/css_js.png b/docs/examples/assets/css_js.png
new file mode 100644
index 0000000000000000000000000000000000000000..9c0d2e60fef6a850badf251717c8bced2944ca36
Binary files /dev/null and b/docs/examples/assets/css_js.png differ
diff --git a/docs/examples/assets/css_selector.png b/docs/examples/assets/css_selector.png
new file mode 100644
index 0000000000000000000000000000000000000000..39357bb920182744dfce5509b5f289829eca2aba
Binary files /dev/null and b/docs/examples/assets/css_selector.png differ
diff --git a/docs/examples/assets/exec_script.png b/docs/examples/assets/exec_script.png
new file mode 100644
index 0000000000000000000000000000000000000000..c2e478f70984ba3445629a793a7424a1feab8a64
Binary files /dev/null and b/docs/examples/assets/exec_script.png differ
diff --git a/docs/examples/assets/llm_extraction.png b/docs/examples/assets/llm_extraction.png
new file mode 100644
index 0000000000000000000000000000000000000000..95d2accb9f1d66a6e6f614cf6c15f103962e6476
Binary files /dev/null and b/docs/examples/assets/llm_extraction.png differ
diff --git a/docs/examples/assets/semantic_extraction_cosine.png b/docs/examples/assets/semantic_extraction_cosine.png
new file mode 100644
index 0000000000000000000000000000000000000000..eace4cf502eda675abcc3be9bd790a210a344538
Binary files /dev/null and b/docs/examples/assets/semantic_extraction_cosine.png differ
diff --git a/docs/examples/assets/semantic_extraction_llm.png b/docs/examples/assets/semantic_extraction_llm.png
new file mode 100644
index 0000000000000000000000000000000000000000..1dba8bc6f73cb361942ac24faed85c042f5f474b
Binary files /dev/null and b/docs/examples/assets/semantic_extraction_llm.png differ
diff --git a/docs/examples/async_webcrawler_multiple_urls_example.py b/docs/examples/async_webcrawler_multiple_urls_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d63ac80f364c42ed37c24b902fe46dfd4113538
--- /dev/null
+++ b/docs/examples/async_webcrawler_multiple_urls_example.py
@@ -0,0 +1,48 @@
+# File: async_webcrawler_multiple_urls_example.py
+import os, sys
+# append 2 parent directories to sys.path to import crawl4ai
+parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.append(parent_dir)
+
+import asyncio
+from crawl4ai import AsyncWebCrawler
+
+async def main():
+ # Initialize the AsyncWebCrawler
+ async with AsyncWebCrawler(verbose=True) as crawler:
+ # List of URLs to crawl
+ urls = [
+ "https://example.com",
+ "https://python.org",
+ "https://github.com",
+ "https://stackoverflow.com",
+ "https://news.ycombinator.com"
+ ]
+
+ # Set up crawling parameters
+ word_count_threshold = 100
+
+ # Run the crawling process for multiple URLs
+ results = await crawler.arun_many(
+ urls=urls,
+ word_count_threshold=word_count_threshold,
+ bypass_cache=True,
+ verbose=True
+ )
+
+ # Process the results
+ for result in results:
+ if result.success:
+ print(f"Successfully crawled: {result.url}")
+ print(f"Title: {result.metadata.get('title', 'N/A')}")
+ print(f"Word count: {len(result.markdown.split())}")
+ print(f"Number of links: {len(result.links.get('internal', [])) + len(result.links.get('external', []))}")
+ print(f"Number of images: {len(result.media.get('images', []))}")
+ print("---")
+ else:
+ print(f"Failed to crawl: {result.url}")
+ print(f"Error: {result.error_message}")
+ print("---")
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/docs/examples/browser_optimization_example.py b/docs/examples/browser_optimization_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..f57dc14782172e640fde80d2500d5a2aef9d2b26
--- /dev/null
+++ b/docs/examples/browser_optimization_example.py
@@ -0,0 +1,128 @@
+"""
+This example demonstrates optimal browser usage patterns in Crawl4AI:
+1. Sequential crawling with session reuse
+2. Parallel crawling with browser instance reuse
+3. Performance optimization settings
+"""
+
+import asyncio
+import os
+from typing import List
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+from crawl4ai.content_filter_strategy import PruningContentFilter
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+
+async def crawl_sequential(urls: List[str]):
+ """
+ Sequential crawling using session reuse - most efficient for moderate workloads
+ """
+ print("\n=== Sequential Crawling with Session Reuse ===")
+
+ # Configure browser with optimized settings
+ browser_config = BrowserConfig(
+ headless=True,
+ browser_args=[
+ "--disable-gpu", # Disable GPU acceleration
+ "--disable-dev-shm-usage", # Disable /dev/shm usage
+ "--no-sandbox", # Required for Docker
+ ],
+ viewport={
+ "width": 800,
+ "height": 600,
+ }, # Smaller viewport for better performance
+ )
+
+ # Configure crawl settings
+ crawl_config = CrawlerRunConfig(
+ markdown_generator=DefaultMarkdownGenerator(
+ # content_filter=PruningContentFilter(), In case you need fit_markdown
+ ),
+ )
+
+ # Create single crawler instance
+ crawler = AsyncWebCrawler(config=browser_config)
+ await crawler.start()
+
+ try:
+ session_id = "session1" # Use same session for all URLs
+ for url in urls:
+ result = await crawler.arun(
+ url=url,
+ config=crawl_config,
+ session_id=session_id, # Reuse same browser tab
+ )
+ if result.success:
+ print(f"Successfully crawled {url}")
+ print(f"Content length: {len(result.markdown_v2.raw_markdown)}")
+ finally:
+ await crawler.close()
+
+
+async def crawl_parallel(urls: List[str], max_concurrent: int = 3):
+ """
+ Parallel crawling while reusing browser instance - best for large workloads
+ """
+ print("\n=== Parallel Crawling with Browser Reuse ===")
+
+ browser_config = BrowserConfig(
+ headless=True,
+ browser_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
+ viewport={"width": 800, "height": 600},
+ )
+
+ crawl_config = CrawlerRunConfig(
+ markdown_generator=DefaultMarkdownGenerator(
+ # content_filter=PruningContentFilter(), In case you need fit_markdown
+ ),
+ )
+
+ # Create single crawler instance for all parallel tasks
+ crawler = AsyncWebCrawler(config=browser_config)
+ await crawler.start()
+
+ try:
+ # Create tasks in batches to control concurrency
+ for i in range(0, len(urls), max_concurrent):
+ batch = urls[i : i + max_concurrent]
+ tasks = []
+
+ for j, url in enumerate(batch):
+ session_id = (
+ f"parallel_session_{j}" # Different session per concurrent task
+ )
+ task = crawler.arun(url=url, config=crawl_config, session_id=session_id)
+ tasks.append(task)
+
+ # Wait for batch to complete
+ results = await asyncio.gather(*tasks, return_exceptions=True)
+
+ # Process results
+ for url, result in zip(batch, results):
+ if isinstance(result, Exception):
+ print(f"Error crawling {url}: {str(result)}")
+ elif result.success:
+ print(f"Successfully crawled {url}")
+ print(f"Content length: {len(result.markdown_v2.raw_markdown)}")
+ finally:
+ await crawler.close()
+
+
+async def main():
+ # Example URLs
+ urls = [
+ "https://example.com/page1",
+ "https://example.com/page2",
+ "https://example.com/page3",
+ "https://example.com/page4",
+ ]
+
+ # Demo sequential crawling
+ await crawl_sequential(urls)
+
+ # Demo parallel crawling
+ await crawl_parallel(urls, max_concurrent=2)
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/docs/examples/chainlit.md b/docs/examples/chainlit.md
new file mode 100644
index 0000000000000000000000000000000000000000..3b34b02f459a4e1bb038a0abc8cf5b59f8d80b76
--- /dev/null
+++ b/docs/examples/chainlit.md
@@ -0,0 +1,3 @@
+# Welcome to Crawl4AI! 🚀🤖
+
+Hi there, Developer! 👋 Here is an example of a research pipeline, where you can share a URL in your conversation with any LLM, and then the context of crawled pages will be used as the context.
\ No newline at end of file
diff --git a/docs/examples/crawlai_vs_firecrawl.py b/docs/examples/crawlai_vs_firecrawl.py
new file mode 100644
index 0000000000000000000000000000000000000000..b50b06dac8e1b7218ff63bd76fb28ee153d1c47e
--- /dev/null
+++ b/docs/examples/crawlai_vs_firecrawl.py
@@ -0,0 +1,67 @@
+import os, time
+# append the path to the root of the project
+import sys
+import asyncio
+sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
+from firecrawl import FirecrawlApp
+from crawl4ai import AsyncWebCrawler
+__data__ = os.path.join(os.path.dirname(__file__), '..', '..') + '/.data'
+
+async def compare():
+ app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])
+
+ # Test Firecrawl with a simple crawl
+ start = time.time()
+ scrape_status = app.scrape_url(
+ 'https://www.nbcnews.com/business',
+ params={'formats': ['markdown', 'html']}
+ )
+ end = time.time()
+ print(f"Time taken: {end - start} seconds")
+ print(len(scrape_status['markdown']))
+ # save the markdown content with provider name
+ with open(f"{__data__}/firecrawl_simple.md", "w") as f:
+ f.write(scrape_status['markdown'])
+ # Count how many "cldnry.s-nbcnews.com" are in the markdown
+ print(scrape_status['markdown'].count("cldnry.s-nbcnews.com"))
+
+
+
+ async with AsyncWebCrawler() as crawler:
+ start = time.time()
+ result = await crawler.arun(
+ url="https://www.nbcnews.com/business",
+ # js_code=["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"],
+ word_count_threshold=0,
+ bypass_cache=True,
+ verbose=False
+ )
+ end = time.time()
+ print(f"Time taken: {end - start} seconds")
+ print(len(result.markdown))
+ # save the markdown content with provider name
+ with open(f"{__data__}/crawl4ai_simple.md", "w") as f:
+ f.write(result.markdown)
+ # count how many "cldnry.s-nbcnews.com" are in the markdown
+ print(result.markdown.count("cldnry.s-nbcnews.com"))
+
+ start = time.time()
+ result = await crawler.arun(
+ url="https://www.nbcnews.com/business",
+ js_code=["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"],
+ word_count_threshold=0,
+ bypass_cache=True,
+ verbose=False
+ )
+ end = time.time()
+ print(f"Time taken: {end - start} seconds")
+ print(len(result.markdown))
+ # save the markdown content with provider name
+ with open(f"{__data__}/crawl4ai_js.md", "w") as f:
+ f.write(result.markdown)
+ # count how many "cldnry.s-nbcnews.com" are in the markdown
+ print(result.markdown.count("cldnry.s-nbcnews.com"))
+
+if __name__ == "__main__":
+ asyncio.run(compare())
+
\ No newline at end of file
diff --git a/docs/examples/docker_example.py b/docs/examples/docker_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..48acc80995c44493d66f4ccfc4df721543113066
--- /dev/null
+++ b/docs/examples/docker_example.py
@@ -0,0 +1,357 @@
+import requests
+import json
+import time
+import sys
+import base64
+import os
+from typing import Dict, Any
+
+class Crawl4AiTester:
+ def __init__(self, base_url: str = "http://localhost:11235", api_token: str = None):
+ self.base_url = base_url
+ self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code" # Check environment variable as fallback
+ self.headers = {'Authorization': f'Bearer {self.api_token}'} if self.api_token else {}
+
+ def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]:
+ # Submit crawl job
+ response = requests.post(f"{self.base_url}/crawl", json=request_data, headers=self.headers)
+ if response.status_code == 403:
+ raise Exception("API token is invalid or missing")
+ task_id = response.json()["task_id"]
+ print(f"Task ID: {task_id}")
+
+ # Poll for result
+ start_time = time.time()
+ while True:
+ if time.time() - start_time > timeout:
+ raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds")
+
+ result = requests.get(f"{self.base_url}/task/{task_id}", headers=self.headers)
+ status = result.json()
+
+ if status["status"] == "failed":
+ print("Task failed:", status.get("error"))
+ raise Exception(f"Task failed: {status.get('error')}")
+
+ if status["status"] == "completed":
+ return status
+
+ time.sleep(2)
+
+ def submit_sync(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
+ response = requests.post(f"{self.base_url}/crawl_sync", json=request_data, headers=self.headers, timeout=60)
+ if response.status_code == 408:
+ raise TimeoutError("Task did not complete within server timeout")
+ response.raise_for_status()
+ return response.json()
+
+ def crawl_direct(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
+ """Directly crawl without using task queue"""
+ response = requests.post(
+ f"{self.base_url}/crawl_direct",
+ json=request_data,
+ headers=self.headers
+ )
+ response.raise_for_status()
+ return response.json()
+
+def test_docker_deployment(version="basic"):
+ tester = Crawl4AiTester(
+ base_url="http://localhost:11235",
+ # base_url="https://api.crawl4ai.com" # just for example
+ # api_token="test" # just for example
+ )
+ print(f"Testing Crawl4AI Docker {version} version")
+
+ # Health check with timeout and retry
+ max_retries = 5
+ for i in range(max_retries):
+ try:
+ health = requests.get(f"{tester.base_url}/health", timeout=10)
+ print("Health check:", health.json())
+ break
+ except requests.exceptions.RequestException as e:
+ if i == max_retries - 1:
+ print(f"Failed to connect after {max_retries} attempts")
+ sys.exit(1)
+ print(f"Waiting for service to start (attempt {i+1}/{max_retries})...")
+ time.sleep(5)
+
+ # Test cases based on version
+ test_basic_crawl_direct(tester)
+ test_basic_crawl(tester)
+ test_basic_crawl(tester)
+ test_basic_crawl_sync(tester)
+
+ if version in ["full", "transformer"]:
+ test_cosine_extraction(tester)
+
+ test_js_execution(tester)
+ test_css_selector(tester)
+ test_structured_extraction(tester)
+ test_llm_extraction(tester)
+ test_llm_with_ollama(tester)
+ test_screenshot(tester)
+
+
+def test_basic_crawl(tester: Crawl4AiTester):
+ print("\n=== Testing Basic Crawl ===")
+ request = {
+ "urls": "https://www.nbcnews.com/business",
+ "priority": 10,
+ "session_id": "test"
+ }
+
+ result = tester.submit_and_wait(request)
+ print(f"Basic crawl result length: {len(result['result']['markdown'])}")
+ assert result["result"]["success"]
+ assert len(result["result"]["markdown"]) > 0
+
+def test_basic_crawl_sync(tester: Crawl4AiTester):
+ print("\n=== Testing Basic Crawl (Sync) ===")
+ request = {
+ "urls": "https://www.nbcnews.com/business",
+ "priority": 10,
+ "session_id": "test"
+ }
+
+ result = tester.submit_sync(request)
+ print(f"Basic crawl result length: {len(result['result']['markdown'])}")
+ assert result['status'] == 'completed'
+ assert result['result']['success']
+ assert len(result['result']['markdown']) > 0
+
+def test_basic_crawl_direct(tester: Crawl4AiTester):
+ print("\n=== Testing Basic Crawl (Direct) ===")
+ request = {
+ "urls": "https://www.nbcnews.com/business",
+ "priority": 10,
+ # "session_id": "test"
+ "cache_mode": "bypass" # or "enabled", "disabled", "read_only", "write_only"
+ }
+
+ result = tester.crawl_direct(request)
+ print(f"Basic crawl result length: {len(result['result']['markdown'])}")
+ assert result['result']['success']
+ assert len(result['result']['markdown']) > 0
+
+def test_js_execution(tester: Crawl4AiTester):
+ print("\n=== Testing JS Execution ===")
+ request = {
+ "urls": "https://www.nbcnews.com/business",
+ "priority": 8,
+ "js_code": [
+ "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
+ ],
+ "wait_for": "article.tease-card:nth-child(10)",
+ "crawler_params": {
+ "headless": True
+ }
+ }
+
+ result = tester.submit_and_wait(request)
+ print(f"JS execution result length: {len(result['result']['markdown'])}")
+ assert result["result"]["success"]
+
+def test_css_selector(tester: Crawl4AiTester):
+ print("\n=== Testing CSS Selector ===")
+ request = {
+ "urls": "https://www.nbcnews.com/business",
+ "priority": 7,
+ "css_selector": ".wide-tease-item__description",
+ "crawler_params": {
+ "headless": True
+ },
+ "extra": {"word_count_threshold": 10}
+
+ }
+
+ result = tester.submit_and_wait(request)
+ print(f"CSS selector result length: {len(result['result']['markdown'])}")
+ assert result["result"]["success"]
+
+def test_structured_extraction(tester: Crawl4AiTester):
+ print("\n=== Testing Structured Extraction ===")
+ schema = {
+ "name": "Coinbase Crypto Prices",
+ "baseSelector": ".cds-tableRow-t45thuk",
+ "fields": [
+ {
+ "name": "crypto",
+ "selector": "td:nth-child(1) h2",
+ "type": "text",
+ },
+ {
+ "name": "symbol",
+ "selector": "td:nth-child(1) p",
+ "type": "text",
+ },
+ {
+ "name": "price",
+ "selector": "td:nth-child(2)",
+ "type": "text",
+ }
+ ],
+ }
+
+ request = {
+ "urls": "https://www.coinbase.com/explore",
+ "priority": 9,
+ "extraction_config": {
+ "type": "json_css",
+ "params": {
+ "schema": schema
+ }
+ }
+ }
+
+ result = tester.submit_and_wait(request)
+ extracted = json.loads(result["result"]["extracted_content"])
+ print(f"Extracted {len(extracted)} items")
+ print("Sample item:", json.dumps(extracted[0], indent=2))
+ assert result["result"]["success"]
+ assert len(extracted) > 0
+
+def test_llm_extraction(tester: Crawl4AiTester):
+ print("\n=== Testing LLM Extraction ===")
+ schema = {
+ "type": "object",
+ "properties": {
+ "model_name": {
+ "type": "string",
+ "description": "Name of the OpenAI model."
+ },
+ "input_fee": {
+ "type": "string",
+ "description": "Fee for input token for the OpenAI model."
+ },
+ "output_fee": {
+ "type": "string",
+ "description": "Fee for output token for the OpenAI model."
+ }
+ },
+ "required": ["model_name", "input_fee", "output_fee"]
+ }
+
+ request = {
+ "urls": "https://openai.com/api/pricing",
+ "priority": 8,
+ "extraction_config": {
+ "type": "llm",
+ "params": {
+ "provider": "openai/gpt-4o-mini",
+ "api_token": os.getenv("OPENAI_API_KEY"),
+ "schema": schema,
+ "extraction_type": "schema",
+ "instruction": """From the crawled content, extract all mentioned model names along with their fees for input and output tokens."""
+ }
+ },
+ "crawler_params": {"word_count_threshold": 1}
+ }
+
+ try:
+ result = tester.submit_and_wait(request)
+ extracted = json.loads(result["result"]["extracted_content"])
+ print(f"Extracted {len(extracted)} model pricing entries")
+ print("Sample entry:", json.dumps(extracted[0], indent=2))
+ assert result["result"]["success"]
+ except Exception as e:
+ print(f"LLM extraction test failed (might be due to missing API key): {str(e)}")
+
+def test_llm_with_ollama(tester: Crawl4AiTester):
+ print("\n=== Testing LLM with Ollama ===")
+ schema = {
+ "type": "object",
+ "properties": {
+ "article_title": {
+ "type": "string",
+ "description": "The main title of the news article"
+ },
+ "summary": {
+ "type": "string",
+ "description": "A brief summary of the article content"
+ },
+ "main_topics": {
+ "type": "array",
+ "items": {"type": "string"},
+ "description": "Main topics or themes discussed in the article"
+ }
+ }
+ }
+
+ request = {
+ "urls": "https://www.nbcnews.com/business",
+ "priority": 8,
+ "extraction_config": {
+ "type": "llm",
+ "params": {
+ "provider": "ollama/llama2",
+ "schema": schema,
+ "extraction_type": "schema",
+ "instruction": "Extract the main article information including title, summary, and main topics."
+ }
+ },
+ "extra": {"word_count_threshold": 1},
+ "crawler_params": {"verbose": True}
+ }
+
+ try:
+ result = tester.submit_and_wait(request)
+ extracted = json.loads(result["result"]["extracted_content"])
+ print("Extracted content:", json.dumps(extracted, indent=2))
+ assert result["result"]["success"]
+ except Exception as e:
+ print(f"Ollama extraction test failed: {str(e)}")
+
+def test_cosine_extraction(tester: Crawl4AiTester):
+ print("\n=== Testing Cosine Extraction ===")
+ request = {
+ "urls": "https://www.nbcnews.com/business",
+ "priority": 8,
+ "extraction_config": {
+ "type": "cosine",
+ "params": {
+ "semantic_filter": "business finance economy",
+ "word_count_threshold": 10,
+ "max_dist": 0.2,
+ "top_k": 3
+ }
+ }
+ }
+
+ try:
+ result = tester.submit_and_wait(request)
+ extracted = json.loads(result["result"]["extracted_content"])
+ print(f"Extracted {len(extracted)} text clusters")
+ print("First cluster tags:", extracted[0]["tags"])
+ assert result["result"]["success"]
+ except Exception as e:
+ print(f"Cosine extraction test failed: {str(e)}")
+
+def test_screenshot(tester: Crawl4AiTester):
+ print("\n=== Testing Screenshot ===")
+ request = {
+ "urls": "https://www.nbcnews.com/business",
+ "priority": 5,
+ "screenshot": True,
+ "crawler_params": {
+ "headless": True
+ }
+ }
+
+ result = tester.submit_and_wait(request)
+ print("Screenshot captured:", bool(result["result"]["screenshot"]))
+
+ if result["result"]["screenshot"]:
+ # Save screenshot
+ screenshot_data = base64.b64decode(result["result"]["screenshot"])
+ with open("test_screenshot.jpg", "wb") as f:
+ f.write(screenshot_data)
+ print("Screenshot saved as test_screenshot.jpg")
+
+ assert result["result"]["success"]
+
+if __name__ == "__main__":
+ version = sys.argv[1] if len(sys.argv) > 1 else "basic"
+ # version = "full"
+ test_docker_deployment(version)
\ No newline at end of file
diff --git a/docs/examples/extraction_strategies_example.py b/docs/examples/extraction_strategies_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..348b891ee1290e8c5e10a8d1c53b49cd02ac7606
--- /dev/null
+++ b/docs/examples/extraction_strategies_example.py
@@ -0,0 +1,115 @@
+"""
+Example demonstrating different extraction strategies with various input formats.
+This example shows how to:
+1. Use different input formats (markdown, HTML, fit_markdown)
+2. Work with JSON-based extractors (CSS and XPath)
+3. Use LLM-based extraction with different input formats
+4. Configure browser and crawler settings properly
+"""
+
+import asyncio
+import os
+from typing import Dict, Any
+
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+from crawl4ai.extraction_strategy import (
+ LLMExtractionStrategy,
+ JsonCssExtractionStrategy,
+ JsonXPathExtractionStrategy
+)
+from crawl4ai.chunking_strategy import RegexChunking, IdentityChunking
+from crawl4ai.content_filter_strategy import PruningContentFilter
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+async def run_extraction(crawler: AsyncWebCrawler, url: str, strategy, name: str):
+ """Helper function to run extraction with proper configuration"""
+ try:
+ # Configure the crawler run settings
+ config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ extraction_strategy=strategy,
+ markdown_generator=DefaultMarkdownGenerator(
+ content_filter=PruningContentFilter() # For fit_markdown support
+ )
+ )
+
+ # Run the crawler
+ result = await crawler.arun(url=url, config=config)
+
+ if result.success:
+ print(f"\n=== {name} Results ===")
+ print(f"Extracted Content: {result.extracted_content}")
+ print(f"Raw Markdown Length: {len(result.markdown_v2.raw_markdown)}")
+ print(f"Citations Markdown Length: {len(result.markdown_v2.markdown_with_citations)}")
+ else:
+ print(f"Error in {name}: Crawl failed")
+
+ except Exception as e:
+ print(f"Error in {name}: {str(e)}")
+
+async def main():
+ # Example URL (replace with actual URL)
+ url = "https://example.com/product-page"
+
+ # Configure browser settings
+ browser_config = BrowserConfig(
+ headless=True,
+ verbose=True
+ )
+
+ # Initialize extraction strategies
+
+ # 1. LLM Extraction with different input formats
+ markdown_strategy = LLMExtractionStrategy(
+ provider="openai/gpt-4o-mini",
+ api_token=os.getenv("OPENAI_API_KEY"),
+ instruction="Extract product information including name, price, and description"
+ )
+
+ html_strategy = LLMExtractionStrategy(
+ input_format="html",
+ provider="openai/gpt-4o-mini",
+ api_token=os.getenv("OPENAI_API_KEY"),
+ instruction="Extract product information from HTML including structured data"
+ )
+
+ fit_markdown_strategy = LLMExtractionStrategy(
+ input_format="fit_markdown",
+ provider="openai/gpt-4o-mini",
+ api_token=os.getenv("OPENAI_API_KEY"),
+ instruction="Extract product information from cleaned markdown"
+ )
+
+ # 2. JSON CSS Extraction (automatically uses HTML input)
+ css_schema = {
+ "baseSelector": ".product",
+ "fields": [
+ {"name": "title", "selector": "h1.product-title", "type": "text"},
+ {"name": "price", "selector": ".price", "type": "text"},
+ {"name": "description", "selector": ".description", "type": "text"}
+ ]
+ }
+ css_strategy = JsonCssExtractionStrategy(schema=css_schema)
+
+ # 3. JSON XPath Extraction (automatically uses HTML input)
+ xpath_schema = {
+ "baseSelector": "//div[@class='product']",
+ "fields": [
+ {"name": "title", "selector": ".//h1[@class='product-title']/text()", "type": "text"},
+ {"name": "price", "selector": ".//span[@class='price']/text()", "type": "text"},
+ {"name": "description", "selector": ".//div[@class='description']/text()", "type": "text"}
+ ]
+ }
+ xpath_strategy = JsonXPathExtractionStrategy(schema=xpath_schema)
+
+ # Use context manager for proper resource handling
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ # Run all strategies
+ await run_extraction(crawler, url, markdown_strategy, "Markdown LLM")
+ await run_extraction(crawler, url, html_strategy, "HTML LLM")
+ await run_extraction(crawler, url, fit_markdown_strategy, "Fit Markdown LLM")
+ await run_extraction(crawler, url, css_strategy, "CSS Extraction")
+ await run_extraction(crawler, url, xpath_strategy, "XPath Extraction")
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/docs/examples/full_page_screenshot_and_pdf_export.md b/docs/examples/full_page_screenshot_and_pdf_export.md
new file mode 100644
index 0000000000000000000000000000000000000000..8522675c3537b29fc1fbbbb5b8aa378e1974d78a
--- /dev/null
+++ b/docs/examples/full_page_screenshot_and_pdf_export.md
@@ -0,0 +1,58 @@
+# Capturing Full-Page Screenshots and PDFs from Massive Webpages with Crawl4AI
+
+When dealing with very long web pages, traditional full-page screenshots can be slow or fail entirely. For large pages (like extensive Wikipedia articles), generating a single massive screenshot often leads to delays, memory issues, or style differences.
+
+**The New Approach:**
+We’ve introduced a new feature that effortlessly handles even the biggest pages by first exporting them as a PDF, then converting that PDF into a high-quality image. This approach leverages the browser’s built-in PDF rendering, making it both stable and efficient for very long content. You also have the option to directly save the PDF for your own usage—no need for multiple passes or complex stitching logic.
+
+**Key Benefits:**
+- **Reliability:** The PDF export never times out and works regardless of page length.
+- **Versatility:** Get both the PDF and a screenshot in one crawl, without reloading or reprocessing.
+- **Performance:** Skips manual scrolling and stitching images, reducing complexity and runtime.
+
+**Simple Example:**
+```python
+import os, sys
+import asyncio
+from crawl4ai import AsyncWebCrawler, CacheMode
+
+# Adjust paths as needed
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+
+async def main():
+ async with AsyncWebCrawler() as crawler:
+ # Request both PDF and screenshot
+ result = await crawler.arun(
+ url='https://en.wikipedia.org/wiki/List_of_common_misconceptions',
+ cache_mode=CacheMode.BYPASS,
+ pdf=True,
+ screenshot=True
+ )
+
+ if result.success:
+ # Save screenshot
+ if result.screenshot:
+ from base64 import b64decode
+ with open(os.path.join(__location__, "screenshot.png"), "wb") as f:
+ f.write(b64decode(result.screenshot))
+
+ # Save PDF
+ if result.pdf:
+ pdf_bytes = b64decode(result.pdf)
+ with open(os.path.join(__location__, "page.pdf"), "wb") as f:
+ f.write(pdf_bytes)
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+**What Happens Under the Hood:**
+- Crawl4AI navigates to the target page.
+- If `pdf=True`, it exports the current page as a full PDF, capturing all of its content no matter the length.
+- If `screenshot=True`, and a PDF is already available, it directly converts the first page of that PDF to an image for you—no repeated loading or scrolling.
+- Finally, you get your PDF and/or screenshot ready to use.
+
+**Conclusion:**
+With this feature, Crawl4AI becomes even more robust and versatile for large-scale content extraction. Whether you need a PDF snapshot or a quick screenshot, you now have a reliable solution for even the most extensive webpages.
\ No newline at end of file
diff --git a/docs/examples/hello_world.py b/docs/examples/hello_world.py
new file mode 100644
index 0000000000000000000000000000000000000000..18534d0e09ad7c6054cfc91ccd4c694143691530
--- /dev/null
+++ b/docs/examples/hello_world.py
@@ -0,0 +1,20 @@
+import asyncio
+from crawl4ai import *
+
+async def main():
+ browser_config = BrowserConfig(headless=True, verbose=True)
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ markdown_generator=DefaultMarkdownGenerator(
+ content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0)
+ )
+ )
+ result = await crawler.arun(
+ url="https://www.helloworld.org",
+ config=crawler_config
+ )
+ print(result.markdown_v2.raw_markdown[:500])
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/docs/examples/hooks_example.py b/docs/examples/hooks_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..09e0bc17d204b0c9f14eea6fc1f559eebf11720a
--- /dev/null
+++ b/docs/examples/hooks_example.py
@@ -0,0 +1,107 @@
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+from playwright.async_api import Page, BrowserContext
+
+async def main():
+ print("🔗 Hooks Example: Demonstrating different hook use cases")
+
+ # Configure browser settings
+ browser_config = BrowserConfig(
+ headless=True
+ )
+
+ # Configure crawler settings
+ crawler_run_config = CrawlerRunConfig(
+ js_code="window.scrollTo(0, document.body.scrollHeight);",
+ wait_for="body",
+ cache_mode=CacheMode.BYPASS
+ )
+
+ # Create crawler instance
+ crawler = AsyncWebCrawler(config=browser_config)
+
+ # Define and set hook functions
+ async def on_browser_created(browser, context: BrowserContext, **kwargs):
+ """Hook called after the browser is created"""
+ print("[HOOK] on_browser_created - Browser is ready!")
+ # Example: Set a cookie that will be used for all requests
+ return browser
+
+ async def on_page_context_created(page: Page, context: BrowserContext, **kwargs):
+ """Hook called after a new page and context are created"""
+ print("[HOOK] on_page_context_created - New page created!")
+ # Example: Set default viewport size
+ await context.add_cookies([{
+ 'name': 'session_id',
+ 'value': 'example_session',
+ 'domain': '.example.com',
+ 'path': '/'
+ }])
+ await page.set_viewport_size({"width": 1920, "height": 1080})
+ return page
+
+ async def on_user_agent_updated(page: Page, context: BrowserContext, user_agent: str, **kwargs):
+ """Hook called when the user agent is updated"""
+ print(f"[HOOK] on_user_agent_updated - New user agent: {user_agent}")
+ return page
+
+ async def on_execution_started(page: Page, context: BrowserContext, **kwargs):
+ """Hook called after custom JavaScript execution"""
+ print("[HOOK] on_execution_started - Custom JS executed!")
+ return page
+
+ async def before_goto(page: Page, context: BrowserContext, url: str, **kwargs):
+ """Hook called before navigating to each URL"""
+ print(f"[HOOK] before_goto - About to visit: {url}")
+ # Example: Add custom headers for the request
+ await page.set_extra_http_headers({
+ "Custom-Header": "my-value"
+ })
+ return page
+
+ async def after_goto(page: Page, context: BrowserContext, url: str, response: dict, **kwargs):
+ """Hook called after navigating to each URL"""
+ print(f"[HOOK] after_goto - Successfully loaded: {url}")
+ # Example: Wait for a specific element to be loaded
+ try:
+ await page.wait_for_selector('.content', timeout=1000)
+ print("Content element found!")
+ except Exception:
+ print("Content element not found, continuing anyway")
+ return page
+
+ async def before_retrieve_html(page: Page, context: BrowserContext, **kwargs):
+ """Hook called before retrieving the HTML content"""
+ print("[HOOK] before_retrieve_html - About to get HTML content")
+ # Example: Scroll to bottom to trigger lazy loading
+ await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
+ return page
+
+ async def before_return_html(page: Page, context: BrowserContext, html:str, **kwargs):
+ """Hook called before returning the HTML content"""
+ print(f"[HOOK] before_return_html - Got HTML content (length: {len(html)})")
+ # Example: You could modify the HTML content here if needed
+ return page
+
+ # Set all the hooks
+ crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created)
+ crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created)
+ crawler.crawler_strategy.set_hook("on_user_agent_updated", on_user_agent_updated)
+ crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
+ crawler.crawler_strategy.set_hook("before_goto", before_goto)
+ crawler.crawler_strategy.set_hook("after_goto", after_goto)
+ crawler.crawler_strategy.set_hook("before_retrieve_html", before_retrieve_html)
+ crawler.crawler_strategy.set_hook("before_return_html", before_return_html)
+
+ await crawler.start()
+
+ # Example usage: crawl a simple website
+ url = 'https://example.com'
+ result = await crawler.arun(url, config=crawler_run_config)
+ print(f"\nCrawled URL: {result.url}")
+ print(f"HTML length: {len(result.html)}")
+
+ await crawler.close()
+
+if __name__ == "__main__":
+ import asyncio
+ asyncio.run(main())
\ No newline at end of file
diff --git a/docs/examples/language_support_example.py b/docs/examples/language_support_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..b74a8402e81263edc83e2ecb04454bcd4774f52b
--- /dev/null
+++ b/docs/examples/language_support_example.py
@@ -0,0 +1,45 @@
+import asyncio
+from crawl4ai import AsyncWebCrawler, AsyncPlaywrightCrawlerStrategy
+
+async def main():
+ # Example 1: Setting language when creating the crawler
+ crawler1 = AsyncWebCrawler(
+ crawler_strategy=AsyncPlaywrightCrawlerStrategy(
+ headers={"Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7"}
+ )
+ )
+ result1 = await crawler1.arun("https://www.example.com")
+ print("Example 1 result:", result1.extracted_content[:100]) # Print first 100 characters
+
+ # Example 2: Setting language before crawling
+ crawler2 = AsyncWebCrawler()
+ crawler2.crawler_strategy.headers["Accept-Language"] = "es-ES,es;q=0.9,en-US;q=0.8,en;q=0.7"
+ result2 = await crawler2.arun("https://www.example.com")
+ print("Example 2 result:", result2.extracted_content[:100])
+
+ # Example 3: Setting language when calling arun method
+ crawler3 = AsyncWebCrawler()
+ result3 = await crawler3.arun(
+ "https://www.example.com",
+ headers={"Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7"}
+ )
+ print("Example 3 result:", result3.extracted_content[:100])
+
+ # Example 4: Crawling multiple pages with different languages
+ urls = [
+ ("https://www.example.com", "fr-FR,fr;q=0.9"),
+ ("https://www.example.org", "es-ES,es;q=0.9"),
+ ("https://www.example.net", "de-DE,de;q=0.9"),
+ ]
+
+ crawler4 = AsyncWebCrawler()
+ results = await asyncio.gather(*[
+ crawler4.arun(url, headers={"Accept-Language": lang})
+ for url, lang in urls
+ ])
+
+ for url, result in zip([u for u, _ in urls], results):
+ print(f"Result for {url}:", result.extracted_content[:100])
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/docs/examples/llm_extraction_openai_pricing.py b/docs/examples/llm_extraction_openai_pricing.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ae3d4d1f68e96d85483468bee24bd084599bf26
--- /dev/null
+++ b/docs/examples/llm_extraction_openai_pricing.py
@@ -0,0 +1,40 @@
+from crawl4ai.extraction_strategy import *
+from crawl4ai.crawler_strategy import *
+import os
+import json
+import asyncio
+from pydantic import BaseModel, Field
+
+url = r'https://openai.com/api/pricing/'
+
+class OpenAIModelFee(BaseModel):
+ model_name: str = Field(..., description="Name of the OpenAI model.")
+ input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
+ output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
+
+from crawl4ai import AsyncWebCrawler
+
+async def main():
+ # Use AsyncWebCrawler
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url=url,
+ word_count_threshold=1,
+            extraction_strategy=LLMExtractionStrategy(
+                # provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'),
+                provider="groq/llama-3.1-70b-versatile", api_token=os.getenv('GROQ_API_KEY'),
+                schema=OpenAIModelFee.model_json_schema(),
+                extraction_type="schema",
+                instruction="From the crawled content, extract all mentioned model names along with their "
+                "fees for input and output tokens. Make sure not to miss anything in the entire content. "
+                'One extracted model JSON format should look like this: '
+                '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }',
+            ),
+        )
+ print("Success:", result.success)
+ model_fees = json.loads(result.extracted_content)
+ print(len(model_fees))
+
+ with open(".data/data.json", "w", encoding="utf-8") as f:
+ f.write(result.extracted_content)
+
+asyncio.run(main())
diff --git a/docs/examples/quickstart.ipynb b/docs/examples/quickstart.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..4751dec8b9c2c55df265d4f1440eec0a14c9ddb1
--- /dev/null
+++ b/docs/examples/quickstart.ipynb
@@ -0,0 +1,664 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "0cba38e5",
+ "metadata": {},
+ "source": [
+ "# Crawl4AI 🕷️🤖\n",
+ " \n",
+ "\n",
+ "[![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers)\n",
+ "![PyPI - Downloads](https://img.shields.io/pypi/dm/Crawl4AI)\n",
+ "[![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members)\n",
+ "[![GitHub Issues](https://img.shields.io/github/issues/unclecode/crawl4ai)](https://github.com/unclecode/crawl4ai/issues)\n",
+ "[![GitHub Pull Requests](https://img.shields.io/github/issues-pr/unclecode/crawl4ai)](https://github.com/unclecode/crawl4ai/pulls)\n",
+ "[![License](https://img.shields.io/github/license/unclecode/crawl4ai)](https://github.com/unclecode/crawl4ai/blob/main/LICENSE)\n",
+ "\n",
+ "Crawl4AI simplifies asynchronous web crawling and data extraction, making it accessible for large language models (LLMs) and AI applications. 🆓🌐\n",
+ "\n",
+ "- GitHub Repository: [https://github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)\n",
+ "- Twitter: [@unclecode](https://twitter.com/unclecode)\n",
+ "- Website: [https://crawl4ai.com](https://crawl4ai.com)\n",
+ "\n",
+ "## 🌟 Meet the Crawl4AI Assistant: Your Copilot for Crawling\n",
+ "Use the [Crawl4AI GPT Assistant](https://tinyurl.com/crawl4ai-gpt) as your AI-powered copilot! With this assistant, you can:\n",
+ "- 🧑💻 Generate code for complex crawling and extraction tasks\n",
+ "- 💡 Get tailored support and examples\n",
+ "- 📘 Learn Crawl4AI faster with step-by-step guidance"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "41de6458",
+ "metadata": {},
+ "source": [
+ "### **Quickstart with Crawl4AI**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1380e951",
+ "metadata": {},
+ "source": [
+ "#### 1. **Installation**\n",
+ "Install Crawl4AI and necessary dependencies:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "05fecfad",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# %%capture\n",
+ "!pip install crawl4ai\n",
+ "!pip install nest_asyncio\n",
+ "!playwright install "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "2c2a74c8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import asyncio\n",
+ "import nest_asyncio\n",
+ "nest_asyncio.apply()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f3c558d7",
+ "metadata": {},
+ "source": [
+ "#### 2. **Basic Setup and Simple Crawl**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "003376f3",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[LOG] 🚀 Crawling done for https://www.nbcnews.com/business, success: True, time taken: 1.49 seconds\n",
+ "[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.10 seconds\n",
+ "[LOG] 🔥 Extracting semantic blocks for https://www.nbcnews.com/business, Strategy: AsyncWebCrawler\n",
+ "[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.10 seconds.\n",
+ "IE 11 is not supported. For an optimal experience visit our site on another browser.\n",
+ "\n",
+ "[Morning Rundown: Trump and Harris' vastly different closing pitches, why Kim Jong Un is helping Russia, and an ancient city is discovered by accident](https://www.nbcnews.com/news/harris-speech-ellipse-ancient-mayan-city-morning-rundown-rcna177973)[](https://www.nbcnews.com/news/harris-speech-ellipse-ancient-mayan-city-morning-rundown-rcna177973)\n",
+ "\n",
+ "Skip to Content\n",
+ "\n",
+ "[NBC News Logo](https://www.nbcnews.com)\n",
+ "\n",
+ "Spon\n"
+ ]
+ }
+ ],
+ "source": [
+ "import asyncio\n",
+ "from crawl4ai import AsyncWebCrawler\n",
+ "\n",
+ "async def simple_crawl():\n",
+ " async with AsyncWebCrawler() as crawler:\n",
+ " result = await crawler.arun(\n",
+ " url=\"https://www.nbcnews.com/business\",\n",
+ " bypass_cache=True # By default this is False, meaning the cache will be used\n",
+ " )\n",
+ " print(result.markdown[:500]) # Print the first 500 characters\n",
+ " \n",
+ "asyncio.run(simple_crawl())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "da9b4d50",
+ "metadata": {},
+ "source": [
+ "#### 3. **Dynamic Content Handling**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "5bb8c1e4",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[LOG] 🌤️ Warming up the AsyncWebCrawler\n",
+ "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n",
+ "[LOG] 🕸️ Crawling https://www.nbcnews.com/business using AsyncPlaywrightCrawlerStrategy...\n",
+ "[LOG] ✅ Crawled https://www.nbcnews.com/business successfully!\n",
+ "[LOG] 🚀 Crawling done for https://www.nbcnews.com/business, success: True, time taken: 4.52 seconds\n",
+ "[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.15 seconds\n",
+ "[LOG] 🔥 Extracting semantic blocks for https://www.nbcnews.com/business, Strategy: AsyncWebCrawler\n",
+ "[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.15 seconds.\n",
+ "IE 11 is not supported. For an optimal experience visit our site on another browser.\n",
+ "\n",
+ "[Morning Rundown: Trump and Harris' vastly different closing pitches, why Kim Jong Un is helping Russia, and an ancient city is discovered by accident](https://www.nbcnews.com/news/harris-speech-ellipse-ancient-mayan-city-morning-rundown-rcna177973)[](https://www.nbcnews.com/news/harris-speech-ellipse-ancient-mayan-city-morning-rundown-rcna177973)\n",
+ "\n",
+ "Skip to Content\n",
+ "\n",
+ "[NBC News Logo](https://www.nbcnews.com)\n",
+ "\n",
+ "Spon\n"
+ ]
+ }
+ ],
+ "source": [
+ "async def crawl_dynamic_content():\n",
+ " # You can use wait_for to wait for a condition to be met before returning the result\n",
+ " # wait_for = \"\"\"() => {\n",
+ " # return Array.from(document.querySelectorAll('article.tease-card')).length > 10;\n",
+ " # }\"\"\"\n",
+ "\n",
+ " # wait_for can be also just a css selector\n",
+ " # wait_for = \"article.tease-card:nth-child(10)\"\n",
+ "\n",
+ " async with AsyncWebCrawler(verbose=True) as crawler:\n",
+ " js_code = [\n",
+ " \"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();\"\n",
+ " ]\n",
+ " result = await crawler.arun(\n",
+ " url=\"https://www.nbcnews.com/business\",\n",
+ " js_code=js_code,\n",
+ " # wait_for=wait_for,\n",
+ " bypass_cache=True,\n",
+ " )\n",
+ " print(result.markdown[:500]) # Print first 500 characters\n",
+ "\n",
+ "asyncio.run(crawl_dynamic_content())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "86febd8d",
+ "metadata": {},
+ "source": [
+ "#### 4. **Content Cleaning and Fit Markdown**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8e8ab01f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "async def clean_content():\n",
+ " async with AsyncWebCrawler() as crawler:\n",
+ " result = await crawler.arun(\n",
+ " url=\"https://janineintheworld.com/places-to-visit-in-central-mexico\",\n",
+ " excluded_tags=['nav', 'footer', 'aside'],\n",
+ " remove_overlay_elements=True,\n",
+ " word_count_threshold=10,\n",
+ " bypass_cache=True\n",
+ " )\n",
+ " full_markdown_length = len(result.markdown)\n",
+ " fit_markdown_length = len(result.fit_markdown)\n",
+ " print(f\"Full Markdown Length: {full_markdown_length}\")\n",
+ " print(f\"Fit Markdown Length: {fit_markdown_length}\")\n",
+ " print(result.fit_markdown[:1000])\n",
+ " \n",
+ "\n",
+ "asyncio.run(clean_content())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "55715146",
+ "metadata": {},
+ "source": [
+ "#### 5. **Link Analysis and Smart Filtering**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "2ae47c69",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[LOG] 🚀 Crawling done for https://www.nbcnews.com/business, success: True, time taken: 0.93 seconds\n",
+ "[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.11 seconds\n",
+ "[LOG] 🔥 Extracting semantic blocks for https://www.nbcnews.com/business, Strategy: AsyncWebCrawler\n",
+ "[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.11 seconds.\n",
+ "Found 107 internal links\n",
+ "Found 58 external links\n",
+ "Href: https://www.nbcnews.com/news/harris-speech-ellipse-ancient-mayan-city-morning-rundown-rcna177973\n",
+ "Text: Morning Rundown: Trump and Harris' vastly different closing pitches, why Kim Jong Un is helping Russia, and an ancient city is discovered by accident\n",
+ "\n",
+ "Href: https://www.nbcnews.com\n",
+ "Text: NBC News Logo\n",
+ "\n",
+ "Href: https://www.nbcnews.com/politics/2024-election/live-blog/kamala-harris-donald-trump-rally-election-live-updates-rcna177529\n",
+ "Text: 2024 Election\n",
+ "\n",
+ "Href: https://www.nbcnews.com/politics\n",
+ "Text: Politics\n",
+ "\n",
+ "Href: https://www.nbcnews.com/us-news\n",
+ "Text: U.S. News\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "\n",
+ "async def link_analysis():\n",
+ " async with AsyncWebCrawler() as crawler:\n",
+ " result = await crawler.arun(\n",
+ " url=\"https://www.nbcnews.com/business\",\n",
+ " bypass_cache=True,\n",
+ " exclude_external_links=True,\n",
+ " exclude_social_media_links=True,\n",
+ " # exclude_domains=[\"facebook.com\", \"twitter.com\"]\n",
+ " )\n",
+ " print(f\"Found {len(result.links['internal'])} internal links\")\n",
+ " print(f\"Found {len(result.links['external'])} external links\")\n",
+ "\n",
+ " for link in result.links['internal'][:5]:\n",
+ " print(f\"Href: {link['href']}\\nText: {link['text']}\\n\")\n",
+ " \n",
+ "\n",
+ "asyncio.run(link_analysis())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "80cceef3",
+ "metadata": {},
+ "source": [
+ "#### 6. **Media Handling**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "1fed7f99",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[LOG] 🚀 Crawling done for https://www.nbcnews.com/business, success: True, time taken: 1.42 seconds\n",
+ "[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.11 seconds\n",
+ "[LOG] 🔥 Extracting semantic blocks for https://www.nbcnews.com/business, Strategy: AsyncWebCrawler\n",
+ "[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.12 seconds.\n",
+ "Image URL: https://media-cldnry.s-nbcnews.com/image/upload/t_focal-762x508,f_auto,q_auto:best/rockcms/2024-10/241023-NM-Chilccare-jg-27b982.jpg, Alt: , Score: 4\n",
+ "Image URL: https://media-cldnry.s-nbcnews.com/image/upload/t_focal-80x80,f_auto,q_auto:best/rockcms/2024-10/241030-china-ev-electric-mb-0746-cae05c.jpg, Alt: Volkswagen Workshop in Hefei, Score: 5\n",
+ "Image URL: https://media-cldnry.s-nbcnews.com/image/upload/t_focal-80x80,f_auto,q_auto:best/rockcms/2024-10/241029-nyc-subway-sandwich-2021-ac-922p-a92374.jpg, Alt: A sub is prepared at a Subway restaurant in Manhattan, New York City, Score: 5\n",
+ "Image URL: https://media-cldnry.s-nbcnews.com/image/upload/t_focal-80x80,f_auto,q_auto:best/rockcms/2024-10/241029-suv-gravity-ch-1618-752415.jpg, Alt: The Lucid Gravity car., Score: 5\n",
+ "Image URL: https://media-cldnry.s-nbcnews.com/image/upload/t_focal-80x80,f_auto,q_auto:best/rockcms/2024-10/241029-dearborn-michigan-f-150-ford-ranger-trucks-assembly-line-ac-426p-614f0b.jpg, Alt: Ford Introduces new F-150 And Ranger Trucks At Their Dearborn Plant, Score: 5\n"
+ ]
+ }
+ ],
+ "source": [
+ "async def media_handling():\n",
+ " async with AsyncWebCrawler() as crawler:\n",
+ " result = await crawler.arun(\n",
+ " url=\"https://www.nbcnews.com/business\", \n",
+ " bypass_cache=True,\n",
+ " exclude_external_images=False,\n",
+ " screenshot=True\n",
+ " )\n",
+ " for img in result.media['images'][:5]:\n",
+ " print(f\"Image URL: {img['src']}, Alt: {img['alt']}, Score: {img['score']}\")\n",
+ " \n",
+ "asyncio.run(media_handling())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9290499a",
+ "metadata": {},
+ "source": [
+ "#### 7. **Using Hooks for Custom Workflow**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9d069c2b",
+ "metadata": {},
+ "source": [
+ "Hooks in Crawl4AI allow you to run custom logic at specific stages of the crawling process. This can be invaluable for scenarios like setting custom headers, logging activities, or processing content before it is returned. Below is an example of a basic workflow using a hook, followed by a complete list of available hooks and explanations on their usage."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "bc4d2fc8",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[Hook] Preparing to navigate...\n",
+ "[LOG] 🚀 Crawling done for https://crawl4ai.com, success: True, time taken: 3.49 seconds\n",
+ "[LOG] 🚀 Content extracted for https://crawl4ai.com, success: True, time taken: 0.03 seconds\n",
+ "[LOG] 🔥 Extracting semantic blocks for https://crawl4ai.com, Strategy: AsyncWebCrawler\n",
+ "[LOG] 🚀 Extraction done for https://crawl4ai.com, time taken: 0.03 seconds.\n",
+ "[Crawl4AI Documentation](https://docs.crawl4ai.com/)\n",
+ "\n",
+ " * [ Home ](.)\n",
+ " * [ Installation ](basic/installation/)\n",
+ " * [ Quick Start ](basic/quickstart/)\n",
+ " * [ Search ](#)\n",
+ "\n",
+ "\n",
+ "\n",
+ " * Home\n",
+ " * [Installation](basic/installation/)\n",
+ " * [Quick Start](basic/quickstart/)\n",
+ " * Basic\n",
+ " * [Simple Crawling](basic/simple-crawling/)\n",
+ " * [Output Formats](basic/output-formats/)\n",
+ " * [Browser Configuration](basic/browser-config/)\n",
+ " * [Page Interaction](basic/page-interaction/)\n",
+ " * [Content Selection](basic/con\n"
+ ]
+ }
+ ],
+ "source": [
+ "async def custom_hook_workflow():\n",
+ " async with AsyncWebCrawler() as crawler:\n",
+ " # Set a 'before_goto' hook to run custom code just before navigation\n",
+ " crawler.crawler_strategy.set_hook(\"before_goto\", lambda page: print(\"[Hook] Preparing to navigate...\"))\n",
+ " \n",
+ " # Perform the crawl operation\n",
+ " result = await crawler.arun(\n",
+ " url=\"https://crawl4ai.com\",\n",
+ " bypass_cache=True\n",
+ " )\n",
+ " print(result.markdown[:500]) # Display the first 500 characters\n",
+ "\n",
+ "asyncio.run(custom_hook_workflow())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3ff45e21",
+ "metadata": {},
+ "source": [
+ "List of available hooks and examples for each stage of the crawling process:\n",
+ "\n",
+ "- **on_browser_created**\n",
+ " ```python\n",
+ " async def on_browser_created_hook(browser):\n",
+ " print(\"[Hook] Browser created\")\n",
+ " ```\n",
+ "\n",
+ "- **before_goto**\n",
+ " ```python\n",
+ " async def before_goto_hook(page):\n",
+ " await page.set_extra_http_headers({\"X-Test-Header\": \"test\"})\n",
+ " ```\n",
+ "\n",
+ "- **after_goto**\n",
+ " ```python\n",
+ " async def after_goto_hook(page):\n",
+ " print(f\"[Hook] Navigated to {page.url}\")\n",
+ " ```\n",
+ "\n",
+ "- **on_execution_started**\n",
+ " ```python\n",
+ " async def on_execution_started_hook(page):\n",
+ " print(\"[Hook] JavaScript execution started\")\n",
+ " ```\n",
+ "\n",
+ "- **before_return_html**\n",
+ " ```python\n",
+ " async def before_return_html_hook(page, html):\n",
+ " print(f\"[Hook] HTML length: {len(html)}\")\n",
+ " ```"
+ ]
+ },
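+  {
+   "cell_type": "markdown",
+   "id": "hook-registration-sketch",
+   "metadata": {},
+   "source": [
+    "As a minimal sketch (assuming the async hook functions from the list above are defined in your session), the hooks are registered on the crawler strategy before calling `arun`:\n",
+    "\n",
+    "```python\n",
+    "async def register_hooks_example():\n",
+    "    async with AsyncWebCrawler() as crawler:\n",
+    "        crawler.crawler_strategy.set_hook(\"before_goto\", before_goto_hook)\n",
+    "        crawler.crawler_strategy.set_hook(\"after_goto\", after_goto_hook)\n",
+    "        crawler.crawler_strategy.set_hook(\"before_return_html\", before_return_html_hook)\n",
+    "        result = await crawler.arun(url=\"https://crawl4ai.com\", bypass_cache=True)\n",
+    "        print(result.markdown[:200])\n",
+    "\n",
+    "# asyncio.run(register_hooks_example())\n",
+    "```"
+   ]
+  },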
+ {
+ "cell_type": "markdown",
+ "id": "2d56ebb1",
+ "metadata": {},
+ "source": [
+ "#### 8. **Session-Based Crawling**\n",
+ "\n",
+ "When to Use Session-Based Crawling: \n",
+ "Session-based crawling is especially beneficial when navigating through multi-page content where each page load needs to maintain the same session context. For instance, in cases where a “Next Page” button must be clicked to load subsequent data, the new data often replaces the previous content. Here, session-based crawling keeps the browser state intact across each interaction, allowing for sequential actions within the same session.\n",
+ "\n",
+ "Example: Multi-Page Navigation Using JavaScript\n",
+ "In this example, we’ll navigate through multiple pages by clicking a \"Next Page\" button. After each page load, we extract the new content and repeat the process."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e7bfebae",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "async def multi_page_session_crawl():\n",
+ " async with AsyncWebCrawler() as crawler:\n",
+ " session_id = \"page_navigation_session\"\n",
+ " url = \"https://example.com/paged-content\"\n",
+ "\n",
+ " for page_number in range(1, 4):\n",
+ " result = await crawler.arun(\n",
+ " url=url,\n",
+ " session_id=session_id,\n",
+ " js_code=\"document.querySelector('.next-page-button').click();\" if page_number > 1 else None,\n",
+ " css_selector=\".content-section\",\n",
+ " bypass_cache=True\n",
+ " )\n",
+ " print(f\"Page {page_number} Content:\")\n",
+ " print(result.markdown[:500]) # Print first 500 characters\n",
+ "\n",
+ "# asyncio.run(multi_page_session_crawl())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ad32a778",
+ "metadata": {},
+ "source": [
+ "#### 9. **Using Extraction Strategies**\n",
+ "\n",
+ "**LLM Extraction**\n",
+ "\n",
+ "This example demonstrates how to use language model-based extraction to retrieve structured data from a pricing page on OpenAI’s site."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "3011a7c5",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "--- Extracting Structured Data with openai/gpt-4o-mini ---\n",
+ "[LOG] 🌤️ Warming up the AsyncWebCrawler\n",
+ "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n",
+ "[LOG] 🕸️ Crawling https://openai.com/api/pricing/ using AsyncPlaywrightCrawlerStrategy...\n",
+ "[LOG] ✅ Crawled https://openai.com/api/pricing/ successfully!\n",
+ "[LOG] 🚀 Crawling done for https://openai.com/api/pricing/, success: True, time taken: 1.29 seconds\n",
+ "[LOG] 🚀 Content extracted for https://openai.com/api/pricing/, success: True, time taken: 0.13 seconds\n",
+ "[LOG] 🔥 Extracting semantic blocks for https://openai.com/api/pricing/, Strategy: AsyncWebCrawler\n",
+ "[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 0\n",
+ "[LOG] Extracted 26 blocks from URL: https://openai.com/api/pricing/ block index: 0\n",
+ "[LOG] 🚀 Extraction done for https://openai.com/api/pricing/, time taken: 15.12 seconds.\n",
+ "[{'model_name': 'gpt-4o', 'input_fee': '$2.50 / 1M input tokens', 'output_fee': '$10.00 / 1M output tokens', 'error': False}, {'model_name': 'gpt-4o-2024-08-06', 'input_fee': '$2.50 / 1M input tokens', 'output_fee': '$10.00 / 1M output tokens', 'error': False}, {'model_name': 'gpt-4o-audio-preview', 'input_fee': '$2.50 / 1M input tokens', 'output_fee': '$10.00 / 1M output tokens', 'error': False}, {'model_name': 'gpt-4o-audio-preview-2024-10-01', 'input_fee': '$2.50 / 1M input tokens', 'output_fee': '$10.00 / 1M output tokens', 'error': False}, {'model_name': 'gpt-4o-2024-05-13', 'input_fee': '$5.00 / 1M input tokens', 'output_fee': '$15.00 / 1M output tokens', 'error': False}]\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/unclecode/devs/crawl4ai/venv/lib/python3.10/site-packages/pydantic/main.py:347: UserWarning: Pydantic serializer warnings:\n",
+ " Expected `PromptTokensDetails` but got `dict` - serialized value may not be as expected\n",
+ " return self.__pydantic_serializer__.to_python(\n"
+ ]
+ }
+ ],
+ "source": [
+ "from crawl4ai.extraction_strategy import LLMExtractionStrategy\n",
+ "from pydantic import BaseModel, Field\n",
+ "import os, json\n",
+ "\n",
+ "class OpenAIModelFee(BaseModel):\n",
+ " model_name: str = Field(..., description=\"Name of the OpenAI model.\")\n",
+ " input_fee: str = Field(..., description=\"Fee for input token for the OpenAI model.\")\n",
+ " output_fee: str = Field(\n",
+ " ..., description=\"Fee for output token for the OpenAI model.\"\n",
+ " )\n",
+ "\n",
+ "async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: dict = None):\n",
+ " print(f\"\\n--- Extracting Structured Data with {provider} ---\")\n",
+ " \n",
+ " # Skip if API token is missing (for providers that require it)\n",
+ " if api_token is None and provider != \"ollama\":\n",
+ " print(f\"API token is required for {provider}. Skipping this example.\")\n",
+ " return\n",
+ "\n",
+ " extra_args = {\"extra_headers\": extra_headers} if extra_headers else {}\n",
+ "\n",
+ " async with AsyncWebCrawler(verbose=True) as crawler:\n",
+ " result = await crawler.arun(\n",
+ " url=\"https://openai.com/api/pricing/\",\n",
+ " word_count_threshold=1,\n",
+ " extraction_strategy=LLMExtractionStrategy(\n",
+ " provider=provider,\n",
+ " api_token=api_token,\n",
+ " schema=OpenAIModelFee.schema(),\n",
+ " extraction_type=\"schema\",\n",
+ " instruction=\"\"\"Extract all model names along with fees for input and output tokens.\"\n",
+ " \"{model_name: 'GPT-4', input_fee: 'US$10.00 / 1M tokens', output_fee: 'US$30.00 / 1M tokens'}.\"\"\",\n",
+ " **extra_args\n",
+ " ),\n",
+ " bypass_cache=True,\n",
+ " )\n",
+ " print(json.loads(result.extracted_content)[:5])\n",
+ "\n",
+ "# Usage:\n",
+ "await extract_structured_data_using_llm(\"openai/gpt-4o-mini\", os.getenv(\"OPENAI_API_KEY\"))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6532db9d",
+ "metadata": {},
+ "source": [
+ "**Cosine Similarity Strategy**\n",
+ "\n",
+ "This strategy uses semantic clustering to extract relevant content based on contextual similarity, which is helpful when extracting related sections from a single topic."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "ec079108",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[LOG] Loading Extraction Model for mps device.\n",
+ "[LOG] Loading Multilabel Classifier for mps device.\n",
+ "[LOG] Model loaded sentence-transformers/all-MiniLM-L6-v2, models/reuters, took 5.193778038024902 seconds\n",
+ "[LOG] 🚀 Crawling done for https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156, success: True, time taken: 1.37 seconds\n",
+ "[LOG] 🚀 Content extracted for https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156, success: True, time taken: 0.07 seconds\n",
+ "[LOG] 🔥 Extracting semantic blocks for https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156, Strategy: AsyncWebCrawler\n",
+ "[LOG] 🚀 Assign tags using mps\n",
+ "[LOG] 🚀 Categorization done in 0.55 seconds\n",
+ "[LOG] 🚀 Extraction done for https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156, time taken: 6.63 seconds.\n",
+ "[{'index': 1, 'tags': ['news_&_social_concern'], 'content': \"McDonald's 2024 combo: Inflation, a health crisis and a side of politics # McDonald's 2024 combo: Inflation, a health crisis and a side of politics\"}, {'index': 2, 'tags': ['business_&_entrepreneurs', 'news_&_social_concern'], 'content': 'Like many major brands, McDonald’s raked in big profits as the economy reopened from the pandemic. In October 2022, [executives were boasting](https://www.cnbc.com/2022/10/27/mcdonalds-mcd-earnings-q3-2022.html) that they’d been raising prices without crimping traffic, even as competitors began to warn that some customers were closing their wallets after inflation peaked above 9% that summer. Still, the U.S. had repeatedly dodged a much-forecast recession, and [Americans kept spending on nonessentials](https://www.nbcnews.com/business/economy/year-peak-inflation-travel-leisure-mostly-cost-less-rcna92760) like travel and dining out — despite regularly relaying to pollsters their dismal views of an otherwise solid economy. Even so, 64% of consumers said they noticed price increases at quick-service restaurants in September, more than at any other type of venue, according to a survey by Datassential, a food and beverage market researcher. Politicians are still drawing attention to fast-food costs, too, as the election season barrels toward a tumultuous finish. A group of Democratic senators this month [denounced McDonald’s for menu prices](https://www.nbcnews.com/news/us-news/democratic-senators-slam-mcdonalds-menu-price-hikes-rcna176380) that they said outstripped inflation, accusing the company of looking to profit “at the expense of people’s ability to put food on the table.” The financial results come toward the end of a humbling year for the nearly $213 billion restaurant chain, whose shares remained steady on the heels of its latest earnings. Kempczinski [sought to reassure investors](https://www.cnbc.com/2024/10/29/mcdonalds-e-coli-outbreak-ceo-comments.html) that [the E. coli outbreak](https://www.nbcnews.com/health/health-news/illnesses-linked-mcdonalds-e-coli-outbreak-rise-75-cdc-says-rcna177260), linked to Quarter Pounder burgers, was under control after the health crisis temporarily dented the company’s stock and caused U.S. foot traffic to drop nearly 10% in the days afterward, according to estimates by Gordon Haskett financial researchers. The fast-food giant [reported Tuesday](https://www.cnbc.com/2024/10/29/mcdonalds-mcd-earnings-q3-2024.html) that it had reversed its recent U.S. sales drop, posting a 0.3% uptick in the third quarter. Foot traffic was still down slightly, but the company said its summer of discounts was paying off. But by early this year, [photos of eye-watering menu prices](https://x.com/sam_learner/status/1681367351143301129) at some McDonald’s locations — including an $18 Big Mac combo at a Connecticut rest stop from July 2023 — went viral, bringing diners’ long-simmering frustrations to a boiling point that the company couldn’t ignore. On an earnings call in April, Kempczinski acknowledged that foot traffic had fallen. 
“We will stay laser-focused on providing an unparalleled experience with simple, everyday value and affordability that our consumers can count on as they continue to be mindful about their spending,” CEO Chris Kempczinski [said in a statement](https://www.prnewswire.com/news-releases/mcdonalds-reports-third-quarter-2024-results-302289216.html?Fds-Load-Behavior=force-external) alongside the earnings report.'}, {'index': 3, 'tags': ['food_&_dining', 'news_&_social_concern'], 'content': '![mcdonalds drive-thru economy fast food](https://media-cldnry.s-nbcnews.com/image/upload/t_fit-760w,f_auto,q_auto:best/rockcms/2024-10/241024-los-angeles-mcdonalds-drive-thru-ac-1059p-cfc311.jpg)McDonald’s has had some success leaning into discounts this year. Eric Thayer / Bloomberg via Getty Images file'}, {'index': 4, 'tags': ['business_&_entrepreneurs', 'food_&_dining', 'news_&_social_concern'], 'content': 'McDonald’s has faced a customer revolt over pricey Big Macs, an unsolicited cameo in election-season crossfire, and now an E. coli outbreak — just as the company had been luring customers back with more affordable burgers. Despite a difficult quarter, McDonald’s looks resilient in the face of various pressures, analysts say — something the company shares with U.S. consumers overall. “Consumers continue to be even more discriminating with every dollar that they spend,” he said at the time. Going forward, McDonald’s would be “laser-focused” on affordability. “McDonald’s has also done a good job of embedding the brand in popular culture to enhance its relevance and meaning around fun and family. But it also needed to modify the product line to meet the expectations of a consumer who is on a tight budget,” he said. “The thing that McDonald’s had struggled with, and why I think we’re seeing kind of an inflection point, is a value proposition,” Senatore said. “McDonald’s menu price increases had run ahead of a lot of its restaurant peers. … Consumers are savvy enough to know that.” For many consumers, the fast-food giant’s menus serve as an informal gauge of the economy overall, said Sara Senatore, a Bank of America analyst covering restaurants. “The spotlight is always on McDonald’s because it’s so big” and something of a “bellwether,” she said. McDonald’s didn’t respond to requests for comment.'}, {'index': 5, 'tags': ['business_&_entrepreneurs', 'food_&_dining'], 'content': 'Mickey D’s’ $5 meal deal, which it launched in late June to jumpstart slumping sales, has given the company an appealing price point to advertise nationwide, Senatore said, speculating that it could open the door to a new permanent value offering. But before that promotion rolled out, the company’s reputation as a low-cost option had taken a bruising hit.'}]\n"
+ ]
+ }
+ ],
+ "source": [
+ "from crawl4ai.extraction_strategy import CosineStrategy\n",
+ "\n",
+ "async def cosine_similarity_extraction():\n",
+ " async with AsyncWebCrawler() as crawler:\n",
+ " strategy = CosineStrategy(\n",
+ " word_count_threshold=10,\n",
+ " max_dist=0.2, # Maximum distance between two words\n",
+ " linkage_method=\"ward\", # Linkage method for hierarchical clustering (ward, complete, average, single)\n",
+ " top_k=3, # Number of top keywords to extract\n",
+ " sim_threshold=0.3, # Similarity threshold for clustering\n",
+ " semantic_filter=\"McDonald's economic impact, American consumer trends\", # Keywords to filter the content semantically using embeddings\n",
+ " verbose=True\n",
+ " )\n",
+ " \n",
+ " result = await crawler.arun(\n",
+ " url=\"https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156\",\n",
+ " extraction_strategy=strategy\n",
+ " )\n",
+ " print(json.loads(result.extracted_content)[:5])\n",
+ "\n",
+ "asyncio.run(cosine_similarity_extraction())\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ff423629",
+ "metadata": {},
+ "source": [
+ "#### 10. **Conclusion and Next Steps**\n",
+ "\n",
+ "You’ve explored core features of Crawl4AI, including dynamic content handling, link analysis, and advanced extraction strategies. Visit our documentation for further details on using Crawl4AI’s extensive features.\n",
+ "\n",
+ "- GitHub Repository: [https://github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)\n",
+ "- Twitter: [@unclecode](https://twitter.com/unclecode)\n",
+ "- Website: [https://crawl4ai.com](https://crawl4ai.com)\n",
+ "\n",
+ "Happy Crawling with Crawl4AI! 🕷️🤖\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d34c1d35",
+ "metadata": {},
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/examples/quickstart_async.config.py b/docs/examples/quickstart_async.config.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c4a9d8643becc9af0d734b7cec15ddf1c11c2de
--- /dev/null
+++ b/docs/examples/quickstart_async.config.py
@@ -0,0 +1,610 @@
+import os, sys
+
+sys.path.append(
+ os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+)
+
+import asyncio
+import time
+import json
+import re
+from typing import Dict, List
+from bs4 import BeautifulSoup
+from pydantic import BaseModel, Field
+from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
+from crawl4ai.extraction_strategy import (
+    JsonCssExtractionStrategy,
+    LLMExtractionStrategy,
+    CosineStrategy,
+)
+
+__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+
+print("Crawl4AI: Advanced Web Crawling and Data Extraction")
+print("GitHub Repository: https://github.com/unclecode/crawl4ai")
+print("Twitter: @unclecode")
+print("Website: https://crawl4ai.com")
+
+
+# Basic Example - Simple Crawl
+async def simple_crawl():
+ print("\n--- Basic Usage ---")
+ browser_config = BrowserConfig(headless=True)
+ crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(
+ url="https://www.nbcnews.com/business", config=crawler_config
+ )
+ print(result.markdown[:500])
+
+
+async def clean_content():
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ excluded_tags=["nav", "footer", "aside"],
+ remove_overlay_elements=True,
+ markdown_generator=DefaultMarkdownGenerator(
+ content_filter=PruningContentFilter(
+ threshold=0.48, threshold_type="fixed", min_word_threshold=0
+ ),
+ options={"ignore_links": True},
+ ),
+ )
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url="https://en.wikipedia.org/wiki/Apple",
+ config=crawler_config,
+ )
+ full_markdown_length = len(result.markdown_v2.raw_markdown)
+ fit_markdown_length = len(result.markdown_v2.fit_markdown)
+ print(f"Full Markdown Length: {full_markdown_length}")
+ print(f"Fit Markdown Length: {fit_markdown_length}")
+
+async def link_analysis():
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.ENABLED,
+ exclude_external_links=True,
+ exclude_social_media_links=True,
+ )
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url="https://www.nbcnews.com/business",
+ config=crawler_config,
+ )
+ print(f"Found {len(result.links['internal'])} internal links")
+ print(f"Found {len(result.links['external'])} external links")
+
+ for link in result.links['internal'][:5]:
+ print(f"Href: {link['href']}\nText: {link['text']}\n")
+
+# JavaScript Execution Example
+async def simple_example_with_running_js_code():
+ print("\n--- Executing JavaScript and Using CSS Selectors ---")
+
+ browser_config = BrowserConfig(headless=True, java_script_enabled=True)
+
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ js_code="const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();",
+ # wait_for="() => { return Array.from(document.querySelectorAll('article.tease-card')).length > 10; }"
+ )
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(
+ url="https://www.nbcnews.com/business", config=crawler_config
+ )
+ print(result.markdown[:500])
+
+
+# CSS Selector Example
+async def simple_example_with_css_selector():
+ print("\n--- Using CSS Selectors ---")
+ browser_config = BrowserConfig(headless=True)
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS, css_selector=".wide-tease-item__description"
+ )
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(
+ url="https://www.nbcnews.com/business", config=crawler_config
+ )
+ print(result.markdown[:500])
+
+async def media_handling():
+ crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, exclude_external_images=True, screenshot=True)
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url="https://www.nbcnews.com/business",
+ config=crawler_config
+ )
+ for img in result.media['images'][:5]:
+ print(f"Image URL: {img['src']}, Alt: {img['alt']}, Score: {img['score']}")
+
+async def custom_hook_workflow(verbose=True):
+ async with AsyncWebCrawler() as crawler:
+ # Set a 'before_goto' hook to run custom code just before navigation
+ crawler.crawler_strategy.set_hook("before_goto", lambda page, context: print("[Hook] Preparing to navigate..."))
+
+ # Perform the crawl operation
+ result = await crawler.arun(
+ url="https://crawl4ai.com"
+ )
+ print(result.markdown_v2.raw_markdown[:500].replace("\n", " -- "))
+
+
+# Proxy Example
+async def use_proxy():
+ print("\n--- Using a Proxy ---")
+ browser_config = BrowserConfig(
+ headless=True,
+ proxy_config={
+ "server": "http://proxy.example.com:8080",
+ "username": "username",
+ "password": "password",
+ },
+ )
+ crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(
+ url="https://www.nbcnews.com/business", config=crawler_config
+ )
+ if result.success:
+ print(result.markdown[:500])
+
+
+# Screenshot Example
+async def capture_and_save_screenshot(url: str, output_path: str):
+ browser_config = BrowserConfig(headless=True)
+ crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, screenshot=True)
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(url=url, config=crawler_config)
+
+ if result.success and result.screenshot:
+ import base64
+
+ screenshot_data = base64.b64decode(result.screenshot)
+ with open(output_path, "wb") as f:
+ f.write(screenshot_data)
+ print(f"Screenshot saved successfully to {output_path}")
+ else:
+ print("Failed to capture screenshot")
+
+
+# LLM Extraction Example
+class OpenAIModelFee(BaseModel):
+ model_name: str = Field(..., description="Name of the OpenAI model.")
+ input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
+ output_fee: str = Field(
+ ..., description="Fee for output token for the OpenAI model."
+ )
+
+
+async def extract_structured_data_using_llm(
+ provider: str, api_token: str = None, extra_headers: Dict[str, str] = None
+):
+ print(f"\n--- Extracting Structured Data with {provider} ---")
+
+ if api_token is None and provider != "ollama":
+ print(f"API token is required for {provider}. Skipping this example.")
+ return
+
+ browser_config = BrowserConfig(headless=True)
+
+ extra_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000}
+ if extra_headers:
+ extra_args["extra_headers"] = extra_headers
+
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ word_count_threshold=1,
+ page_timeout=80000,
+ extraction_strategy=LLMExtractionStrategy(
+ provider=provider,
+ api_token=api_token,
+ schema=OpenAIModelFee.model_json_schema(),
+ extraction_type="schema",
+ instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
+ Do not miss any models in the entire content.""",
+ extra_args=extra_args,
+ ),
+ )
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(
+ url="https://openai.com/api/pricing/", config=crawler_config
+ )
+ print(result.extracted_content)
+
+
+# CSS Extraction Example
+async def extract_structured_data_using_css_extractor():
+ print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
+ schema = {
+ "name": "KidoCode Courses",
+ "baseSelector": "section.charge-methodology .w-tab-content > div",
+ "fields": [
+ {
+ "name": "section_title",
+ "selector": "h3.heading-50",
+ "type": "text",
+ },
+ {
+ "name": "section_description",
+ "selector": ".charge-content",
+ "type": "text",
+ },
+ {
+ "name": "course_name",
+ "selector": ".text-block-93",
+ "type": "text",
+ },
+ {
+ "name": "course_description",
+ "selector": ".course-content-text",
+ "type": "text",
+ },
+ {
+ "name": "course_icon",
+ "selector": ".image-92",
+ "type": "attribute",
+ "attribute": "src",
+ },
+ ],
+ }
+
+ browser_config = BrowserConfig(headless=True, java_script_enabled=True)
+
+ js_click_tabs = """
+ (async () => {
+ const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");
+ for(let tab of tabs) {
+ tab.scrollIntoView();
+ tab.click();
+ await new Promise(r => setTimeout(r, 500));
+ }
+ })();
+ """
+
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ extraction_strategy=JsonCssExtractionStrategy(schema),
+ js_code=[js_click_tabs],
+ )
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(
+ url="https://www.kidocode.com/degrees/technology", config=crawler_config
+ )
+
+ companies = json.loads(result.extracted_content)
+ print(f"Successfully extracted {len(companies)} companies")
+ print(json.dumps(companies[0], indent=2))
+
+
+# Dynamic Content Examples - Method 1
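+# Uses a shared session plus an "on_execution_started" hook that polls the page until the
+# first commit title changes, signalling that the next page of commits has rendered.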
+async def crawl_dynamic_content_pages_method_1():
+ print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
+ first_commit = ""
+
+ async def on_execution_started(page, **kwargs):
+ nonlocal first_commit
+ try:
+ while True:
+ await page.wait_for_selector("li.Box-sc-g0xbh4-0 h4")
+ commit = await page.query_selector("li.Box-sc-g0xbh4-0 h4")
+ commit = await commit.evaluate("(element) => element.textContent")
+ commit = re.sub(r"\s+", "", commit)
+ if commit and commit != first_commit:
+ first_commit = commit
+ break
+ await asyncio.sleep(0.5)
+ except Exception as e:
+ print(f"Warning: New content didn't appear after JavaScript execution: {e}")
+
+ browser_config = BrowserConfig(headless=False, java_script_enabled=True)
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
+
+ url = "https://github.com/microsoft/TypeScript/commits/main"
+ session_id = "typescript_commits_session"
+ all_commits = []
+
+ js_next_page = """
+ const button = document.querySelector('a[data-testid="pagination-next-button"]');
+ if (button) button.click();
+ """
+
+ for page in range(3):
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ css_selector="li.Box-sc-g0xbh4-0",
+ js_code=js_next_page if page > 0 else None,
+ js_only=page > 0,
+ session_id=session_id,
+ )
+
+ result = await crawler.arun(url=url, config=crawler_config)
+ assert result.success, f"Failed to crawl page {page + 1}"
+
+ soup = BeautifulSoup(result.cleaned_html, "html.parser")
+ commits = soup.select("li")
+ all_commits.extend(commits)
+
+ print(f"Page {page + 1}: Found {len(commits)} commits")
+
+ print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
+
+
+# Dynamic Content Examples - Method 2
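+# Same multi-page flow, but the wait-for-new-content logic lives in the injected JavaScript
+# itself, and results are parsed with JsonCssExtractionStrategy instead of BeautifulSoup.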
+async def crawl_dynamic_content_pages_method_2():
+ print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
+
+ browser_config = BrowserConfig(headless=False, java_script_enabled=True)
+
+ js_next_page_and_wait = """
+ (async () => {
+ const getCurrentCommit = () => {
+ const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
+ return commits.length > 0 ? commits[0].textContent.trim() : null;
+ };
+
+ const initialCommit = getCurrentCommit();
+ const button = document.querySelector('a[data-testid="pagination-next-button"]');
+ if (button) button.click();
+
+ while (true) {
+ await new Promise(resolve => setTimeout(resolve, 100));
+ const newCommit = getCurrentCommit();
+ if (newCommit && newCommit !== initialCommit) {
+ break;
+ }
+ }
+ })();
+ """
+
+ schema = {
+ "name": "Commit Extractor",
+ "baseSelector": "li.Box-sc-g0xbh4-0",
+ "fields": [
+ {
+ "name": "title",
+ "selector": "h4.markdown-title",
+ "type": "text",
+ "transform": "strip",
+ },
+ ],
+ }
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ url = "https://github.com/microsoft/TypeScript/commits/main"
+ session_id = "typescript_commits_session"
+ all_commits = []
+
+ extraction_strategy = JsonCssExtractionStrategy(schema)
+
+ for page in range(3):
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ css_selector="li.Box-sc-g0xbh4-0",
+ extraction_strategy=extraction_strategy,
+ js_code=js_next_page_and_wait if page > 0 else None,
+ js_only=page > 0,
+ session_id=session_id,
+ )
+
+ result = await crawler.arun(url=url, config=crawler_config)
+ assert result.success, f"Failed to crawl page {page + 1}"
+
+ commits = json.loads(result.extracted_content)
+ all_commits.extend(commits)
+ print(f"Page {page + 1}: Found {len(commits)} commits")
+
+ print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
+
+
+async def cosine_similarity_extraction():
+ crawl_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ extraction_strategy=CosineStrategy(
+ word_count_threshold=10,
+ max_dist=0.2, # Maximum distance between two words
+ linkage_method="ward", # Linkage method for hierarchical clustering (ward, complete, average, single)
+ top_k=3, # Number of top keywords to extract
+ sim_threshold=0.3, # Similarity threshold for clustering
+ semantic_filter="McDonald's economic impact, American consumer trends", # Keywords to filter the content semantically using embeddings
+ verbose=True
+ ),
+ )
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url="https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156",
+ config=crawl_config
+ )
+ print(json.loads(result.extracted_content)[:5])
+
+# Browser Comparison
+async def crawl_custom_browser_type():
+ print("\n--- Browser Comparison ---")
+
+ # Firefox
+ browser_config_firefox = BrowserConfig(browser_type="firefox", headless=True)
+ start = time.time()
+ async with AsyncWebCrawler(config=browser_config_firefox) as crawler:
+ result = await crawler.arun(
+ url="https://www.example.com",
+ config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
+ )
+ print("Firefox:", time.time() - start)
+ print(result.markdown[:500])
+
+ # WebKit
+ browser_config_webkit = BrowserConfig(browser_type="webkit", headless=True)
+ start = time.time()
+ async with AsyncWebCrawler(config=browser_config_webkit) as crawler:
+ result = await crawler.arun(
+ url="https://www.example.com",
+ config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
+ )
+ print("WebKit:", time.time() - start)
+ print(result.markdown[:500])
+
+ # Chromium (default)
+ browser_config_chromium = BrowserConfig(browser_type="chromium", headless=True)
+ start = time.time()
+ async with AsyncWebCrawler(config=browser_config_chromium) as crawler:
+ result = await crawler.arun(
+ url="https://www.example.com",
+ config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
+ )
+ print("Chromium:", time.time() - start)
+ print(result.markdown[:500])
+
+
+# Anti-Bot and User Simulation
+async def crawl_with_user_simulation():
+ browser_config = BrowserConfig(
+ headless=True,
+ user_agent_mode="random",
+ user_agent_generator_config={"device_type": "mobile", "os_type": "android"},
+ )
+
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ magic=True,
+ simulate_user=True,
+ override_navigator=True,
+ )
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(url="YOUR-URL-HERE", config=crawler_config)
+ print(result.markdown)
+
+async def ssl_certification():
+    # Directory where the exported certificate files will be written
+    tmp_dir = os.path.join(__location__, "tmp")
+    os.makedirs(tmp_dir, exist_ok=True)
+
+    # Configure crawler to fetch SSL certificate
+ config = CrawlerRunConfig(
+ fetch_ssl_certificate=True,
+ cache_mode=CacheMode.BYPASS # Bypass cache to always get fresh certificates
+ )
+
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url='https://example.com',
+ config=config
+ )
+
+ if result.success and result.ssl_certificate:
+ cert = result.ssl_certificate
+
+ # 1. Access certificate properties directly
+ print("\nCertificate Information:")
+ print(f"Issuer: {cert.issuer.get('CN', '')}")
+ print(f"Valid until: {cert.valid_until}")
+ print(f"Fingerprint: {cert.fingerprint}")
+
+ # 2. Export certificate in different formats
+ cert.to_json(os.path.join(tmp_dir, "certificate.json")) # For analysis
+ print("\nCertificate exported to:")
+ print(f"- JSON: {os.path.join(tmp_dir, 'certificate.json')}")
+
+ pem_data = cert.to_pem(os.path.join(tmp_dir, "certificate.pem")) # For web servers
+ print(f"- PEM: {os.path.join(tmp_dir, 'certificate.pem')}")
+
+ der_data = cert.to_der(os.path.join(tmp_dir, "certificate.der")) # For Java apps
+ print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}")
+
+# Speed Comparison
+async def speed_comparison():
+ print("\n--- Speed Comparison ---")
+
+ # Firecrawl comparison
+ from firecrawl import FirecrawlApp
+
+ app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
+ start = time.time()
+ scrape_status = app.scrape_url(
+ "https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]}
+ )
+ end = time.time()
+ print("Firecrawl:")
+ print(f"Time taken: {end - start:.2f} seconds")
+ print(f"Content length: {len(scrape_status['markdown'])} characters")
+ print(f"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}")
+ print()
+
+ # Crawl4AI comparisons
+ browser_config = BrowserConfig(headless=True)
+
+ # Simple crawl
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ start = time.time()
+ result = await crawler.arun(
+ url="https://www.nbcnews.com/business",
+ config=CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS, word_count_threshold=0
+ ),
+ )
+ end = time.time()
+ print("Crawl4AI (simple crawl):")
+ print(f"Time taken: {end - start:.2f} seconds")
+ print(f"Content length: {len(result.markdown)} characters")
+ print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
+ print()
+
+ # Advanced filtering
+ start = time.time()
+ result = await crawler.arun(
+ url="https://www.nbcnews.com/business",
+ config=CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ word_count_threshold=0,
+ markdown_generator=DefaultMarkdownGenerator(
+ content_filter=PruningContentFilter(
+ threshold=0.48, threshold_type="fixed", min_word_threshold=0
+ )
+ ),
+ ),
+ )
+ end = time.time()
+ print("Crawl4AI (Markdown Plus):")
+ print(f"Time taken: {end - start:.2f} seconds")
+ print(f"Content length: {len(result.markdown_v2.raw_markdown)} characters")
+ print(f"Fit Markdown: {len(result.markdown_v2.fit_markdown)} characters")
+ print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
+ print()
+
+
+# Main execution
+async def main():
+ # Basic examples
+ # await simple_crawl()
+ # await simple_example_with_running_js_code()
+ # await simple_example_with_css_selector()
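+
+    # Content, link, media, and hook examples
+    # await clean_content()
+    # await link_analysis()
+    # await media_handling()
+    # await use_proxy()
+    # await custom_hook_workflow()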
+
+ # Advanced examples
+ # await extract_structured_data_using_css_extractor()
+ await extract_structured_data_using_llm(
+ "openai/gpt-4o", os.getenv("OPENAI_API_KEY")
+ )
+ # await crawl_dynamic_content_pages_method_1()
+ # await crawl_dynamic_content_pages_method_2()
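+    # await cosine_similarity_extraction()
+    # await crawl_with_user_simulation()
+    # await ssl_certification()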
+
+ # Browser comparisons
+ # await crawl_custom_browser_type()
+
+ # Performance testing
+ # await speed_comparison()
+
+ # Screenshot example
+ # await capture_and_save_screenshot(
+ # "https://www.example.com",
+ # os.path.join(__location__, "tmp/example_screenshot.jpg")
+ # )
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py
new file mode 100644
index 0000000000000000000000000000000000000000..e640e6bd2843ecd7aa650eb97ce29e8410961ada
--- /dev/null
+++ b/docs/examples/quickstart_async.py
@@ -0,0 +1,640 @@
+import os, sys
+# append parent directory to system path
+sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+# Set FIRECRAWL_API_KEY in your environment before running the speed comparison example.
+
+import asyncio
+# import nest_asyncio
+# nest_asyncio.apply()
+
+import time
+import json
+import os
+import re
+from typing import Dict, List
+from bs4 import BeautifulSoup
+from pydantic import BaseModel, Field
+from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
+from crawl4ai.extraction_strategy import (
+ JsonCssExtractionStrategy,
+ LLMExtractionStrategy,
+)
+
+__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+
+print("Crawl4AI: Advanced Web Crawling and Data Extraction")
+print("GitHub Repository: https://github.com/unclecode/crawl4ai")
+print("Twitter: @unclecode")
+print("Website: https://crawl4ai.com")
+
+
+async def simple_crawl():
+ print("\n--- Basic Usage ---")
+ async with AsyncWebCrawler(verbose=True) as crawler:
+ result = await crawler.arun(url="https://www.nbcnews.com/business", cache_mode= CacheMode.BYPASS)
+ print(result.markdown[:500]) # Print first 500 characters
+
+async def simple_example_with_running_js_code():
+ print("\n--- Executing JavaScript and Using CSS Selectors ---")
+ # New code to handle the wait_for parameter
+ wait_for = """() => {
+ return Array.from(document.querySelectorAll('article.tease-card')).length > 10;
+ }"""
+
+ # wait_for can be also just a css selector
+ # wait_for = "article.tease-card:nth-child(10)"
+
+ async with AsyncWebCrawler(verbose=True) as crawler:
+ js_code = [
+ "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
+ ]
+ result = await crawler.arun(
+ url="https://www.nbcnews.com/business",
+ js_code=js_code,
+ # wait_for=wait_for,
+ cache_mode=CacheMode.BYPASS,
+ )
+ print(result.markdown[:500]) # Print first 500 characters
+
+async def simple_example_with_css_selector():
+ print("\n--- Using CSS Selectors ---")
+ async with AsyncWebCrawler(verbose=True) as crawler:
+ result = await crawler.arun(
+ url="https://www.nbcnews.com/business",
+ css_selector=".wide-tease-item__description",
+ cache_mode=CacheMode.BYPASS,
+ )
+ print(result.markdown[:500]) # Print first 500 characters
+
+async def use_proxy():
+ print("\n--- Using a Proxy ---")
+ print(
+ "Note: Replace 'http://your-proxy-url:port' with a working proxy to run this example."
+ )
+    # Replace the placeholder proxy URL below with a working proxy before running
+ async with AsyncWebCrawler(verbose=True, proxy="http://your-proxy-url:port") as crawler:
+ result = await crawler.arun(
+ url="https://www.nbcnews.com/business",
+ cache_mode= CacheMode.BYPASS
+ )
+ if result.success:
+ print(result.markdown[:500]) # Print first 500 characters
+
+async def capture_and_save_screenshot(url: str, output_path: str):
+ async with AsyncWebCrawler(verbose=True) as crawler:
+ result = await crawler.arun(
+ url=url,
+ screenshot=True,
+ cache_mode= CacheMode.BYPASS
+ )
+
+ if result.success and result.screenshot:
+ import base64
+
+ # Decode the base64 screenshot data
+ screenshot_data = base64.b64decode(result.screenshot)
+
+ # Save the screenshot as a JPEG file
+ with open(output_path, 'wb') as f:
+ f.write(screenshot_data)
+
+ print(f"Screenshot saved successfully to {output_path}")
+ else:
+ print("Failed to capture screenshot")
+
+class OpenAIModelFee(BaseModel):
+ model_name: str = Field(..., description="Name of the OpenAI model.")
+ input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
+ output_fee: str = Field(
+ ..., description="Fee for output token for the OpenAI model."
+ )
+
+async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: Dict[str, str] = None):
+ print(f"\n--- Extracting Structured Data with {provider} ---")
+
+ if api_token is None and provider != "ollama":
+ print(f"API token is required for {provider}. Skipping this example.")
+ return
+
+ # extra_args = {}
+ extra_args={
+ "temperature": 0,
+ "top_p": 0.9,
+ "max_tokens": 2000,
+ # any other supported parameters for litellm
+ }
+ if extra_headers:
+ extra_args["extra_headers"] = extra_headers
+
+ async with AsyncWebCrawler(verbose=True) as crawler:
+ result = await crawler.arun(
+ url="https://openai.com/api/pricing/",
+ word_count_threshold=1,
+ extraction_strategy=LLMExtractionStrategy(
+ provider=provider,
+ api_token=api_token,
+ schema=OpenAIModelFee.model_json_schema(),
+ extraction_type="schema",
+ instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
+ Do not miss any models in the entire content. One extracted model JSON format should look like this:
+ {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""",
+ extra_args=extra_args
+ ),
+ cache_mode=CacheMode.BYPASS,
+ )
+ print(result.extracted_content)
+
+async def extract_structured_data_using_css_extractor():
+ print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
+ schema = {
+ "name": "KidoCode Courses",
+ "baseSelector": "section.charge-methodology .w-tab-content > div",
+ "fields": [
+ {
+ "name": "section_title",
+ "selector": "h3.heading-50",
+ "type": "text",
+ },
+ {
+ "name": "section_description",
+ "selector": ".charge-content",
+ "type": "text",
+ },
+ {
+ "name": "course_name",
+ "selector": ".text-block-93",
+ "type": "text",
+ },
+ {
+ "name": "course_description",
+ "selector": ".course-content-text",
+ "type": "text",
+ },
+ {
+ "name": "course_icon",
+ "selector": ".image-92",
+ "type": "attribute",
+ "attribute": "src"
+ }
+ ]
+ }
+
+ async with AsyncWebCrawler(
+ headless=True,
+ verbose=True
+ ) as crawler:
+
+ # Create the JavaScript that handles clicking multiple times
+ js_click_tabs = """
+ (async () => {
+ const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");
+
+ for(let tab of tabs) {
+ // scroll to the tab
+ tab.scrollIntoView();
+ tab.click();
+ // Wait for content to load and animations to complete
+ await new Promise(r => setTimeout(r, 500));
+ }
+ })();
+ """
+
+ result = await crawler.arun(
+ url="https://www.kidocode.com/degrees/technology",
+ extraction_strategy=JsonCssExtractionStrategy(schema, verbose=True),
+ js_code=[js_click_tabs],
+ cache_mode=CacheMode.BYPASS
+ )
+
+ courses = json.loads(result.extracted_content)
+ print(f"Successfully extracted {len(courses)} course entries")
+ print(json.dumps(courses[0], indent=2))
+
+# Advanced Session-Based Crawling with Dynamic Content 🔄
+async def crawl_dynamic_content_pages_method_1():
+ print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
+ first_commit = ""
+
+ async def on_execution_started(page):
+ nonlocal first_commit
+ try:
+ while True:
+ await page.wait_for_selector("li.Box-sc-g0xbh4-0 h4")
+ commit = await page.query_selector("li.Box-sc-g0xbh4-0 h4")
+ commit = await commit.evaluate("(element) => element.textContent")
+ commit = re.sub(r"\s+", "", commit)
+ if commit and commit != first_commit:
+ first_commit = commit
+ break
+ await asyncio.sleep(0.5)
+ except Exception as e:
+ print(f"Warning: New content didn't appear after JavaScript execution: {e}")
+
+ async with AsyncWebCrawler(verbose=True) as crawler:
+ crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
+
+ url = "https://github.com/microsoft/TypeScript/commits/main"
+ session_id = "typescript_commits_session"
+ all_commits = []
+
+ js_next_page = """
+ (() => {
+ const button = document.querySelector('a[data-testid="pagination-next-button"]');
+ if (button) button.click();
+ })();
+ """
+
+ for page in range(3): # Crawl 3 pages
+ result = await crawler.arun(
+ url=url,
+ session_id=session_id,
+ css_selector="li.Box-sc-g0xbh4-0",
+ js=js_next_page if page > 0 else None,
+ cache_mode=CacheMode.BYPASS,
+ js_only=page > 0,
+ headless=False,
+ )
+
+ assert result.success, f"Failed to crawl page {page + 1}"
+
+ soup = BeautifulSoup(result.cleaned_html, "html.parser")
+ commits = soup.select("li")
+ all_commits.extend(commits)
+
+ print(f"Page {page + 1}: Found {len(commits)} commits")
+
+ await crawler.crawler_strategy.kill_session(session_id)
+ print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
+
+async def crawl_dynamic_content_pages_method_2():
+ print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
+
+ async with AsyncWebCrawler(verbose=True) as crawler:
+ url = "https://github.com/microsoft/TypeScript/commits/main"
+ session_id = "typescript_commits_session"
+ all_commits = []
+ last_commit = ""
+
+ js_next_page_and_wait = """
+ (async () => {
+ const getCurrentCommit = () => {
+ const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
+ return commits.length > 0 ? commits[0].textContent.trim() : null;
+ };
+
+ const initialCommit = getCurrentCommit();
+ const button = document.querySelector('a[data-testid="pagination-next-button"]');
+ if (button) button.click();
+
+ // Poll for changes
+ while (true) {
+ await new Promise(resolve => setTimeout(resolve, 100)); // Wait 100ms
+ const newCommit = getCurrentCommit();
+ if (newCommit && newCommit !== initialCommit) {
+ break;
+ }
+ }
+ })();
+ """
+
+ schema = {
+ "name": "Commit Extractor",
+ "baseSelector": "li.Box-sc-g0xbh4-0",
+ "fields": [
+ {
+ "name": "title",
+ "selector": "h4.markdown-title",
+ "type": "text",
+ "transform": "strip",
+ },
+ ],
+ }
+ extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
+
+ for page in range(3): # Crawl 3 pages
+ result = await crawler.arun(
+ url=url,
+ session_id=session_id,
+ css_selector="li.Box-sc-g0xbh4-0",
+ extraction_strategy=extraction_strategy,
+ js_code=js_next_page_and_wait if page > 0 else None,
+ js_only=page > 0,
+ cache_mode=CacheMode.BYPASS,
+ headless=False,
+ )
+
+ assert result.success, f"Failed to crawl page {page + 1}"
+
+ commits = json.loads(result.extracted_content)
+ all_commits.extend(commits)
+
+ print(f"Page {page + 1}: Found {len(commits)} commits")
+
+ await crawler.crawler_strategy.kill_session(session_id)
+ print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
+
+async def crawl_dynamic_content_pages_method_3():
+ print("\n--- Advanced Multi-Page Crawling with JavaScript Execution using `wait_for` ---")
+
+ async with AsyncWebCrawler(verbose=True) as crawler:
+ url = "https://github.com/microsoft/TypeScript/commits/main"
+ session_id = "typescript_commits_session"
+ all_commits = []
+
+ js_next_page = """
+ const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
+ if (commits.length > 0) {
+ window.firstCommit = commits[0].textContent.trim();
+ }
+ const button = document.querySelector('a[data-testid="pagination-next-button"]');
+ if (button) button.click();
+ """
+
+ wait_for = """() => {
+ const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
+ if (commits.length === 0) return false;
+ const firstCommit = commits[0].textContent.trim();
+ return firstCommit !== window.firstCommit;
+ }"""
+
+ schema = {
+ "name": "Commit Extractor",
+ "baseSelector": "li.Box-sc-g0xbh4-0",
+ "fields": [
+ {
+ "name": "title",
+ "selector": "h4.markdown-title",
+ "type": "text",
+ "transform": "strip",
+ },
+ ],
+ }
+ extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
+
+ for page in range(3): # Crawl 3 pages
+ result = await crawler.arun(
+ url=url,
+ session_id=session_id,
+ css_selector="li.Box-sc-g0xbh4-0",
+ extraction_strategy=extraction_strategy,
+ js_code=js_next_page if page > 0 else None,
+ wait_for=wait_for if page > 0 else None,
+ js_only=page > 0,
+ cache_mode=CacheMode.BYPASS,
+ headless=False,
+ )
+
+ assert result.success, f"Failed to crawl page {page + 1}"
+
+ commits = json.loads(result.extracted_content)
+ all_commits.extend(commits)
+
+ print(f"Page {page + 1}: Found {len(commits)} commits")
+
+ await crawler.crawler_strategy.kill_session(session_id)
+ print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
+
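+# Note: the three session-based methods above differ only in how they detect that the
+# next page of commits has rendered: method 1 uses a Python-side `on_execution_started`
+# hook, method 2 polls for the change inside the injected JavaScript itself, and
+# method 3 delegates the waiting to `arun`'s `wait_for` predicate.
+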
+async def crawl_custom_browser_type():
+ # Use Firefox
+ start = time.time()
+ async with AsyncWebCrawler(browser_type="firefox", verbose=True, headless = True) as crawler:
+ result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS)
+ print(result.markdown[:500])
+ print("Time taken: ", time.time() - start)
+
+ # Use WebKit
+ start = time.time()
+ async with AsyncWebCrawler(browser_type="webkit", verbose=True, headless = True) as crawler:
+ result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS)
+ print(result.markdown[:500])
+ print("Time taken: ", time.time() - start)
+
+ # Use Chromium (default)
+ start = time.time()
+ async with AsyncWebCrawler(verbose=True, headless = True) as crawler:
+ result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS)
+ print(result.markdown[:500])
+ print("Time taken: ", time.time() - start)
+
+async def crawl_with_user_simulation():
+ async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
+ url = "YOUR-URL-HERE"
+ result = await crawler.arun(
+ url=url,
+ cache_mode=CacheMode.BYPASS,
+ magic=True,  # Automatically detects and removes overlays, popups, and other elements that block content
+ # simulate_user=True,  # Performs random mouse movements and clicks to simulate user interaction
+ # override_navigator=True,  # Overrides the navigator object so the browser looks like a real user
+ )
+
+ print(result.markdown)
+
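+# A small variation with all three anti-bot options enabled at once; each flag is
+# independent, and the URL below is still a placeholder.
+async def crawl_with_full_user_simulation():
+    async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
+        result = await crawler.arun(
+            url="YOUR-URL-HERE",
+            cache_mode=CacheMode.BYPASS,
+            magic=True,
+            simulate_user=True,
+            override_navigator=True,
+        )
+        print(result.markdown[:500])
+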
+async def speed_comparison():
+ # print("\n--- Speed Comparison ---")
+ # print("Firecrawl (simulated):")
+ # print("Time taken: 7.02 seconds")
+ # print("Content length: 42074 characters")
+ # print("Images found: 49")
+ # print()
+ # Simulated Firecrawl performance
+ from firecrawl import FirecrawlApp
+ app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])
+ start = time.time()
+ scrape_status = app.scrape_url(
+ 'https://www.nbcnews.com/business',
+ params={'formats': ['markdown', 'html']}
+ )
+ end = time.time()
+ print("Firecrawl:")
+ print(f"Time taken: {end - start:.2f} seconds")
+ print(f"Content length: {len(scrape_status['markdown'])} characters")
+ print(f"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}")
+ print()
+
+ async with AsyncWebCrawler() as crawler:
+ # Crawl4AI simple crawl
+ start = time.time()
+ result = await crawler.arun(
+ url="https://www.nbcnews.com/business",
+ word_count_threshold=0,
+ cache_mode=CacheMode.BYPASS,
+ verbose=False,
+ )
+ end = time.time()
+ print("Crawl4AI (simple crawl):")
+ print(f"Time taken: {end - start:.2f} seconds")
+ print(f"Content length: {len(result.markdown)} characters")
+ print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
+ print()
+
+ # Crawl4AI with advanced content filtering
+ start = time.time()
+ result = await crawler.arun(
+ url="https://www.nbcnews.com/business",
+ word_count_threshold=0,
+ markdown_generator=DefaultMarkdownGenerator(
+ content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0)
+ # content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0)
+ ),
+ cache_mode=CacheMode.BYPASS,
+ verbose=False,
+ )
+ end = time.time()
+ print("Crawl4AI (Markdown Plus):")
+ print(f"Time taken: {end - start:.2f} seconds")
+ print(f"Content length: {len(result.markdown_v2.raw_markdown)} characters")
+ print(f"Fit Markdown: {len(result.markdown_v2.fit_markdown)} characters")
+ print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
+ print()
+
+ # Crawl4AI with JavaScript execution
+ start = time.time()
+ result = await crawler.arun(
+ url="https://www.nbcnews.com/business",
+ js_code=[
+ "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
+ ],
+ word_count_threshold=0,
+ cache_mode=CacheMode.BYPASS,
+ markdown_generator=DefaultMarkdownGenerator(
+ content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0)
+ # content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0)
+ ),
+ verbose=False,
+ )
+ end = time.time()
+ print("Crawl4AI (with JavaScript execution):")
+ print(f"Time taken: {end - start:.2f} seconds")
+ print(f"Content length: {len(result.markdown)} characters")
+ print(f"Fit Markdown: {len(result.markdown_v2.fit_markdown)} characters")
+ print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
+
+ print("\nNote on Speed Comparison:")
+ print("The speed test conducted here may not reflect optimal conditions.")
+ print("When we call Firecrawl's API, we're seeing its best performance,")
+ print("while Crawl4AI's performance is limited by the local network speed.")
+ print("For a more accurate comparison, it's recommended to run these tests")
+ print("on servers with a stable and fast internet connection.")
+ print("Despite these limitations, Crawl4AI still demonstrates faster performance.")
+ print("If you run these tests in an environment with better network conditions,")
+ print("you may observe an even more significant speed advantage for Crawl4AI.")
+
+async def generate_knowledge_graph():
+ class Entity(BaseModel):
+ name: str
+ description: str
+
+ class Relationship(BaseModel):
+ entity1: Entity
+ entity2: Entity
+ description: str
+ relation_type: str
+
+ class KnowledgeGraph(BaseModel):
+ entities: List[Entity]
+ relationships: List[Relationship]
+
+ extraction_strategy = LLMExtractionStrategy(
+ provider='openai/gpt-4o-mini', # Or any other provider, including Ollama and open source models
+ api_token=os.getenv('OPENAI_API_KEY'), # In case of Ollama just pass "no-token"
+ schema=KnowledgeGraph.model_json_schema(),
+ extraction_type="schema",
+ instruction="""Extract entities and relationships from the given text."""
+ )
+ async with AsyncWebCrawler() as crawler:
+ url = "https://paulgraham.com/love.html"
+ result = await crawler.arun(
+ url=url,
+ cache_mode=CacheMode.BYPASS,
+ extraction_strategy=extraction_strategy,
+ # magic=True
+ )
+ # print(result.extracted_content)
+ with open(os.path.join(__location__, "kb.json"), "w") as f:
+ f.write(result.extracted_content)
+
+async def fit_markdown_remove_overlay():
+
+ async with AsyncWebCrawler(
+ headless=True, # Set to False to see what is happening
+ verbose=True,
+ user_agent_mode="random",
+ user_agent_generator_config={
+ "device_type": "mobile",
+ "os_type": "android"
+ },
+ ) as crawler:
+ result = await crawler.arun(
+ url='https://www.kidocode.com/degrees/technology',
+ cache_mode=CacheMode.BYPASS,
+ markdown_generator=DefaultMarkdownGenerator(
+ content_filter=PruningContentFilter(
+ threshold=0.48, threshold_type="fixed", min_word_threshold=0
+ ),
+ options={
+ "ignore_links": True
+ }
+ ),
+ # markdown_generator=DefaultMarkdownGenerator(
+ # content_filter=BM25ContentFilter(user_query="", bm25_threshold=1.0),
+ # options={
+ # "ignore_links": True
+ # }
+ # ),
+ )
+
+ if result.success:
+ print(len(result.markdown_v2.raw_markdown))
+ print(len(result.markdown_v2.markdown_with_citations))
+ print(len(result.markdown_v2.fit_markdown))
+
+ # Save clean html
+ with open(os.path.join(__location__, "output/cleaned_html.html"), "w") as f:
+ f.write(result.cleaned_html)
+
+ with open(os.path.join(__location__, "output/output_raw_markdown.md"), "w") as f:
+ f.write(result.markdown_v2.raw_markdown)
+
+ with open(os.path.join(__location__, "output/output_markdown_with_citations.md"), "w") as f:
+ f.write(result.markdown_v2.markdown_with_citations)
+
+ with open(os.path.join(__location__, "output/output_fit_markdown.md"), "w") as f:
+ f.write(result.markdown_v2.fit_markdown)
+
+ print("Done")
+
+
+async def main():
+ # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
+
+ # await simple_crawl()
+ # await simple_example_with_running_js_code()
+ # await simple_example_with_css_selector()
+ # # await use_proxy()
+ # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
+ # await extract_structured_data_using_css_extractor()
+
+ # LLM extraction examples
+ # await extract_structured_data_using_llm()
+ # await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
+ # await extract_structured_data_using_llm("ollama/llama3.2")
+
+ # You always can pass custom headers to the extraction strategy
+ # custom_headers = {
+ # "Authorization": "Bearer your-custom-token",
+ # "X-Custom-Header": "Some-Value"
+ # }
+ # await extract_structured_data_using_llm(extra_headers=custom_headers)
+
+ # await crawl_dynamic_content_pages_method_1()
+ # await crawl_dynamic_content_pages_method_2()
+ await crawl_dynamic_content_pages_method_3()
+
+ # await crawl_custom_browser_type()
+
+ # await speed_comparison()
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/docs/examples/quickstart_sync.py b/docs/examples/quickstart_sync.py
new file mode 100644
index 0000000000000000000000000000000000000000..89c631397595851b4e923e3e15e962804c83af42
--- /dev/null
+++ b/docs/examples/quickstart_sync.py
@@ -0,0 +1,312 @@
+import os
+import time
+import json
+import base64
+from crawl4ai.web_crawler import WebCrawler
+from crawl4ai.chunking_strategy import *
+from crawl4ai.extraction_strategy import *
+from crawl4ai.crawler_strategy import *
+from rich import print
+from rich.console import Console
+from functools import lru_cache
+
+console = Console()
+
+@lru_cache()
+def create_crawler():
+ crawler = WebCrawler(verbose=True)
+ crawler.warmup()
+ return crawler
+
+def print_result(result):
+ # Print each key on one line with just the first 20 characters of its value and three dots
+ console.print(f"\t[bold]Result:[/bold]")
+ for key, value in result.model_dump().items():
+ if isinstance(value, str) and value:
+ console.print(f"\t{key}: [green]{value[:20]}...[/green]")
+ if result.extracted_content:
+ items = json.loads(result.extracted_content)
+ print(f"\t[bold]{len(items)} blocks is extracted![/bold]")
+
+
+def cprint(message, press_any_key=False):
+ console.print(message)
+ if press_any_key:
+ console.print("Press any key to continue...", style="")
+ input()
+
+def basic_usage(crawler):
+ cprint("🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]")
+ result = crawler.run(url="https://www.nbcnews.com/business", only_text = True)
+ cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
+ print_result(result)
+
+def basic_usage_some_params(crawler):
+ cprint("🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]")
+ result = crawler.run(url="https://www.nbcnews.com/business", word_count_threshold=1, only_text = True)
+ cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
+ print_result(result)
+
+def screenshot_usage(crawler):
+ cprint("\n📸 [bold cyan]Let's take a screenshot of the page![/bold cyan]")
+ result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
+ cprint("[LOG] 📦 [bold yellow]Screenshot result:[/bold yellow]")
+ # Save the screenshot to a file
+ with open("screenshot.png", "wb") as f:
+ f.write(base64.b64decode(result.screenshot))
+ cprint("Screenshot saved to 'screenshot.png'!")
+ print_result(result)
+
+def understanding_parameters(crawler):
+ cprint("\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]")
+ cprint("By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action.")
+
+ # First crawl (fetches the page and caches the result)
+ cprint("1️⃣ First crawl (caches the result):", True)
+ start_time = time.time()
+ result = crawler.run(url="https://www.nbcnews.com/business")
+ end_time = time.time()
+ cprint(f"[LOG] 📦 [bold yellow]First crawl took {end_time - start_time} seconds and result (from cache):[/bold yellow]")
+ print_result(result)
+
+ # Force to crawl again
+ cprint("2️⃣ Second crawl (Force to crawl again):", True)
+ start_time = time.time()
+ result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
+ end_time = time.time()
+ cprint(f"[LOG] 📦 [bold yellow]Second crawl took {end_time - start_time} seconds and result (forced to crawl):[/bold yellow]")
+ print_result(result)
+
+def add_chunking_strategy(crawler):
+ # Adding a chunking strategy: RegexChunking
+ cprint("\n🧩 [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]", True)
+ cprint("RegexChunking is a simple chunking strategy that splits the text based on a given regex pattern. Let's see it in action!")
+ result = crawler.run(
+ url="https://www.nbcnews.com/business",
+ chunking_strategy=RegexChunking(patterns=["\n\n"])
+ )
+ cprint("[LOG] 📦 [bold yellow]RegexChunking result:[/bold yellow]")
+ print_result(result)
+
+ # Adding another chunking strategy: NlpSentenceChunking
+ cprint("\n🔍 [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]", True)
+ cprint("NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!")
+ result = crawler.run(
+ url="https://www.nbcnews.com/business",
+ chunking_strategy=NlpSentenceChunking()
+ )
+ cprint("[LOG] 📦 [bold yellow]NlpSentenceChunking result:[/bold yellow]")
+ print_result(result)
+
+def add_extraction_strategy(crawler):
+ # Adding an extraction strategy: CosineStrategy
+ cprint("\n🧠 [bold cyan]Let's get smarter with an extraction strategy: CosineStrategy![/bold cyan]", True)
+ cprint("CosineStrategy uses cosine similarity to extract semantically similar blocks of text. Let's see it in action!")
+ result = crawler.run(
+ url="https://www.nbcnews.com/business",
+ extraction_strategy=CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold=0.3, verbose=True)
+ )
+ cprint("[LOG] 📦 [bold yellow]CosineStrategy result:[/bold yellow]")
+ print_result(result)
+
+ # Using semantic_filter with CosineStrategy
+ cprint("You can pass other parameters like 'semantic_filter' to the CosineStrategy to extract semantically similar blocks of text. Let's see it in action!")
+ result = crawler.run(
+ url="https://www.nbcnews.com/business",
+ extraction_strategy=CosineStrategy(
+ semantic_filter="inflation rent prices",
+ )
+ )
+ cprint("[LOG] 📦 [bold yellow]CosineStrategy result with semantic filter:[/bold yellow]")
+ print_result(result)
+
+def add_llm_extraction_strategy(crawler):
+ # Adding an LLM extraction strategy without instructions
+ cprint("\n🤖 [bold cyan]Time to bring in the big guns: LLMExtractionStrategy without instructions![/bold cyan]", True)
+ cprint("LLMExtractionStrategy uses a large language model to extract relevant information from the web page. Let's see it in action!")
+ result = crawler.run(
+ url="https://www.nbcnews.com/business",
+ extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'))
+ )
+ cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (no instructions) result:[/bold yellow]")
+ print_result(result)
+
+ # Adding an LLM extraction strategy with instructions
+ cprint("\n📜 [bold cyan]Let's make it even more interesting: LLMExtractionStrategy with instructions![/bold cyan]", True)
+ cprint("Let's say we are only interested in financial news. Let's see how LLMExtractionStrategy performs with instructions!")
+ result = crawler.run(
+ url="https://www.nbcnews.com/business",
+ extraction_strategy=LLMExtractionStrategy(
+ provider="openai/gpt-4o",
+ api_token=os.getenv('OPENAI_API_KEY'),
+ instruction="I am interested in only financial news"
+ )
+ )
+ cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (with instructions) result:[/bold yellow]")
+ print_result(result)
+
+ result = crawler.run(
+ url="https://www.nbcnews.com/business",
+ extraction_strategy=LLMExtractionStrategy(
+ provider="openai/gpt-4o",
+ api_token=os.getenv('OPENAI_API_KEY'),
+ instruction="Extract only content related to technology"
+ )
+ )
+ cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (with technology instruction) result:[/bold yellow]")
+ print_result(result)
+
+def targeted_extraction(crawler):
+ # Using a CSS selector to extract only H2 tags
+ cprint("\n🎯 [bold cyan]Targeted extraction: Let's use a CSS selector to extract only H2 tags![/bold cyan]", True)
+ result = crawler.run(
+ url="https://www.nbcnews.com/business",
+ css_selector="h2"
+ )
+ cprint("[LOG] 📦 [bold yellow]CSS Selector (H2 tags) result:[/bold yellow]")
+ print_result(result)
+
+def interactive_extraction(crawler):
+ # Passing JavaScript code to interact with the page
+ cprint("\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]", True)
+ cprint("In this example we try to click the 'Load More' button on the page using JavaScript code.")
+ js_code = """
+ const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
+ loadMoreButton && loadMoreButton.click();
+ """
+ # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
+ # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
+ result = crawler.run(
+ url="https://www.nbcnews.com/business",
+ js=js_code
+ )
+ cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
+ print_result(result)
+
+def multiple_scripts(crawler):
+ # Passing multiple JavaScript snippets to interact with the page
+ cprint("\n🖱️ [bold cyan]Let's get interactive: Passing multiple JavaScript snippets to click 'Load More' twice![/bold cyan]", True)
+ cprint("In this example we click the 'Load More' button twice by passing a list of JavaScript snippets.")
+ js_code = ["""
+ const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
+ loadMoreButton && loadMoreButton.click();
+ """] * 2
+ # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
+ # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
+ result = crawler.run(
+ url="https://www.nbcnews.com/business",
+ js=js_code
+ )
+ cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
+ print_result(result)
+
+def using_crawler_hooks(crawler):
+ # Example usage of the hooks for authentication and setting a cookie
+ def on_driver_created(driver):
+ print("[HOOK] on_driver_created")
+ # Example customization: maximize the window
+ driver.maximize_window()
+
+ # Example customization: logging in to a hypothetical website
+ driver.get('https://example.com/login')
+
+ from selenium.webdriver.support.ui import WebDriverWait
+ from selenium.webdriver.common.by import By
+ from selenium.webdriver.support import expected_conditions as EC
+
+ WebDriverWait(driver, 10).until(
+ EC.presence_of_element_located((By.NAME, 'username'))
+ )
+ driver.find_element(By.NAME, 'username').send_keys('testuser')
+ driver.find_element(By.NAME, 'password').send_keys('password123')
+ driver.find_element(By.NAME, 'login').click()
+ WebDriverWait(driver, 10).until(
+ EC.presence_of_element_located((By.ID, 'welcome'))
+ )
+ # Add a custom cookie
+ driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'})
+ return driver
+
+
+ def before_get_url(driver):
+ print("[HOOK] before_get_url")
+ # Example customization: add a custom header
+ # Enable Network domain for sending headers
+ driver.execute_cdp_cmd('Network.enable', {})
+ # Add a custom header
+ driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': {'X-Test-Header': 'test'}})
+ return driver
+
+ def after_get_url(driver):
+ print("[HOOK] after_get_url")
+ # Example customization: log the URL
+ print(driver.current_url)
+ return driver
+
+ def before_return_html(driver, html):
+ print("[HOOK] before_return_html")
+ # Example customization: log the HTML
+ print(len(html))
+ return driver
+
+ cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
+
+ crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
+ crawler_strategy.set_hook('on_driver_created', on_driver_created)
+ crawler_strategy.set_hook('before_get_url', before_get_url)
+ crawler_strategy.set_hook('after_get_url', after_get_url)
+ crawler_strategy.set_hook('before_return_html', before_return_html)
+
+ crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
+ crawler.warmup()
+ result = crawler.run(url="https://example.com")
+
+ cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
+ print_result(result= result)
+
+def using_crawler_hooks_delay_example(crawler):
+ def delay(driver):
+ print("Delaying for 5 seconds...")
+ time.sleep(5)
+ print("Resuming...")
+
+ def create_crawler():
+ crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
+ crawler_strategy.set_hook('after_get_url', delay)
+ crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
+ crawler.warmup()
+ return crawler
+
+ cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's add a delay after fetching the url to make sure entire page is fetched.[/bold cyan]")
+ crawler = create_crawler()
+ result = crawler.run(url="https://google.com", bypass_cache=True)
+
+ cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
+ print_result(result)
+
+
+
+def main():
+ cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
+ cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")
+ cprint("If this is the first time you're running Crawl4ai, this might take a few seconds to load required model files.")
+
+ crawler = create_crawler()
+
+ crawler.always_by_pass_cache = True
+ basic_usage(crawler)
+ # basic_usage_some_params(crawler)
+ understanding_parameters(crawler)
+
+ crawler.always_by_pass_cache = True
+ screenshot_usage(crawler)
+ add_chunking_strategy(crawler)
+ add_extraction_strategy(crawler)
+ add_llm_extraction_strategy(crawler)
+ targeted_extraction(crawler)
+ interactive_extraction(crawler)
+ multiple_scripts(crawler)
+
+ cprint("\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]")
+
+if __name__ == "__main__":
+ main()
+
diff --git a/docs/examples/quickstart_v0.ipynb b/docs/examples/quickstart_v0.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..71f23acb7d921a8bafe3f3126088a8d043de87e9
--- /dev/null
+++ b/docs/examples/quickstart_v0.ipynb
@@ -0,0 +1,735 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "6yLvrXn7yZQI"
+ },
+ "source": [
+ "# Crawl4AI: Advanced Web Crawling and Data Extraction\n",
+ "\n",
+ "Welcome to this interactive notebook showcasing Crawl4AI, an advanced asynchronous web crawling and data extraction library.\n",
+ "\n",
+ "- GitHub Repository: [https://github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)\n",
+ "- Twitter: [@unclecode](https://twitter.com/unclecode)\n",
+ "- Website: [https://crawl4ai.com](https://crawl4ai.com)\n",
+ "\n",
+ "Let's explore the powerful features of Crawl4AI!"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "KIn_9nxFyZQK"
+ },
+ "source": [
+ "## Installation\n",
+ "\n",
+ "First, let's install Crawl4AI from GitHub:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "mSnaxLf3zMog"
+ },
+ "outputs": [],
+ "source": [
+ "!sudo apt-get update && sudo apt-get install -y libwoff1 libopus0 libwebp6 libwebpdemux2 libenchant1c2a libgudev-1.0-0 libsecret-1-0 libhyphen0 libgdk-pixbuf2.0-0 libegl1 libnotify4 libxslt1.1 libevent-2.1-7 libgles2 libvpx6 libxcomposite1 libatk1.0-0 libatk-bridge2.0-0 libepoxy0 libgtk-3-0 libharfbuzz-icu0"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "xlXqaRtayZQK"
+ },
+ "outputs": [],
+ "source": [
+ "!pip install crawl4ai\n",
+ "!pip install nest-asyncio\n",
+ "!playwright install"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "qKCE7TI7yZQL"
+ },
+ "source": [
+ "Now, let's import the necessary libraries:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "id": "I67tr7aAyZQL"
+ },
+ "outputs": [],
+ "source": [
+ "import asyncio\n",
+ "import nest_asyncio\n",
+ "from crawl4ai import AsyncWebCrawler\n",
+ "from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy\n",
+ "import json\n",
+ "import time\n",
+ "from pydantic import BaseModel, Field\n",
+ "\n",
+ "nest_asyncio.apply()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "h7yR_Rt_yZQM"
+ },
+ "source": [
+ "## Basic Usage\n",
+ "\n",
+ "Let's start with a simple crawl example:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "yBh6hf4WyZQM",
+ "outputId": "0f83af5c-abba-4175-ed95-70b7512e6bcc"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[LOG] 🌤️ Warming up the AsyncWebCrawler\n",
+ "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n",
+ "[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.05 seconds\n",
+ "[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.05 seconds.\n",
+ "18102\n"
+ ]
+ }
+ ],
+ "source": [
+ "async def simple_crawl():\n",
+ " async with AsyncWebCrawler(verbose=True) as crawler:\n",
+ " result = await crawler.arun(url=\"https://www.nbcnews.com/business\")\n",
+ " print(len(result.markdown))\n",
+ "await simple_crawl()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "9rtkgHI28uI4"
+ },
+ "source": [
+ "💡 By default, **Crawl4AI** caches the result of every URL, so the next time you call it, you’ll get an instant result. But if you want to bypass the cache, just set `bypass_cache=True`."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "MzZ0zlJ9yZQM"
+ },
+ "source": [
+ "## Advanced Features\n",
+ "\n",
+ "### Executing JavaScript and Using CSS Selectors"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "gHStF86xyZQM",
+ "outputId": "34d0fb6d-4dec-4677-f76e-85a1f082829b"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[LOG] 🌤️ Warming up the AsyncWebCrawler\n",
+ "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n",
+ "[LOG] 🕸️ Crawling https://www.nbcnews.com/business using AsyncPlaywrightCrawlerStrategy...\n",
+ "[LOG] ✅ Crawled https://www.nbcnews.com/business successfully!\n",
+ "[LOG] 🚀 Crawling done for https://www.nbcnews.com/business, success: True, time taken: 6.06 seconds\n",
+ "[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.10 seconds\n",
+ "[LOG] 🔥 Extracting semantic blocks for https://www.nbcnews.com/business, Strategy: AsyncWebCrawler\n",
+ "[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.11 seconds.\n",
+ "41135\n"
+ ]
+ }
+ ],
+ "source": [
+ "async def js_and_css():\n",
+ " async with AsyncWebCrawler(verbose=True) as crawler:\n",
+ " js_code = [\"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();\"]\n",
+ " result = await crawler.arun(\n",
+ " url=\"https://www.nbcnews.com/business\",\n",
+ " js_code=js_code,\n",
+ " # css_selector=\"YOUR_CSS_SELECTOR_HERE\",\n",
+ " bypass_cache=True\n",
+ " )\n",
+ " print(len(result.markdown))\n",
+ "\n",
+ "await js_and_css()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "cqE_W4coyZQM"
+ },
+ "source": [
+ "### Using a Proxy\n",
+ "\n",
+ "Note: You'll need to replace the proxy URL with a working proxy for this example to run successfully."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "QjAyiAGqyZQM"
+ },
+ "outputs": [],
+ "source": [
+ "async def use_proxy():\n",
+ " async with AsyncWebCrawler(verbose=True, proxy=\"http://your-proxy-url:port\") as crawler:\n",
+ " result = await crawler.arun(\n",
+ " url=\"https://www.nbcnews.com/business\",\n",
+ " bypass_cache=True\n",
+ " )\n",
+ " print(result.markdown[:500]) # Print first 500 characters\n",
+ "\n",
+ "# Uncomment the following line to run the proxy example\n",
+ "# await use_proxy()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "XTZ88lbayZQN"
+ },
+ "source": [
+ "### Extracting Structured Data with OpenAI\n",
+ "\n",
+ "Note: You'll need to set your OpenAI API key as an environment variable for this example to work."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "fIOlDayYyZQN",
+ "outputId": "cb8359cc-dee0-4762-9698-5dfdcee055b8"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[LOG] 🌤️ Warming up the AsyncWebCrawler\n",
+ "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n",
+ "[LOG] 🕸️ Crawling https://openai.com/api/pricing/ using AsyncPlaywrightCrawlerStrategy...\n",
+ "[LOG] ✅ Crawled https://openai.com/api/pricing/ successfully!\n",
+ "[LOG] 🚀 Crawling done for https://openai.com/api/pricing/, success: True, time taken: 3.77 seconds\n",
+ "[LOG] 🚀 Content extracted for https://openai.com/api/pricing/, success: True, time taken: 0.21 seconds\n",
+ "[LOG] 🔥 Extracting semantic blocks for https://openai.com/api/pricing/, Strategy: AsyncWebCrawler\n",
+ "[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 0\n",
+ "[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 1\n",
+ "[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 2\n",
+ "[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 3\n",
+ "[LOG] Extracted 4 blocks from URL: https://openai.com/api/pricing/ block index: 3\n",
+ "[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 4\n",
+ "[LOG] Extracted 5 blocks from URL: https://openai.com/api/pricing/ block index: 0\n",
+ "[LOG] Extracted 1 blocks from URL: https://openai.com/api/pricing/ block index: 4\n",
+ "[LOG] Extracted 8 blocks from URL: https://openai.com/api/pricing/ block index: 1\n",
+ "[LOG] Extracted 12 blocks from URL: https://openai.com/api/pricing/ block index: 2\n",
+ "[LOG] 🚀 Extraction done for https://openai.com/api/pricing/, time taken: 8.55 seconds.\n",
+ "5029\n"
+ ]
+ }
+ ],
+ "source": [
+ "import os\n",
+ "from google.colab import userdata\n",
+ "os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')\n",
+ "\n",
+ "class OpenAIModelFee(BaseModel):\n",
+ " model_name: str = Field(..., description=\"Name of the OpenAI model.\")\n",
+ " input_fee: str = Field(..., description=\"Fee for input token for the OpenAI model.\")\n",
+ " output_fee: str = Field(..., description=\"Fee for output token for the OpenAI model.\")\n",
+ "\n",
+ "async def extract_openai_fees():\n",
+ " async with AsyncWebCrawler(verbose=True) as crawler:\n",
+ " result = await crawler.arun(\n",
+ " url='https://openai.com/api/pricing/',\n",
+ " word_count_threshold=1,\n",
+ " extraction_strategy=LLMExtractionStrategy(\n",
+ " provider=\"openai/gpt-4o\", api_token=os.getenv('OPENAI_API_KEY'),\n",
+ " schema=OpenAIModelFee.schema(),\n",
+ " extraction_type=\"schema\",\n",
+ " instruction=\"\"\"From the crawled content, extract all mentioned model names along with their fees for input and output tokens.\n",
+ " Do not miss any models in the entire content. One extracted model JSON format should look like this:\n",
+ " {\"model_name\": \"GPT-4\", \"input_fee\": \"US$10.00 / 1M tokens\", \"output_fee\": \"US$30.00 / 1M tokens\"}.\"\"\"\n",
+ " ),\n",
+ " bypass_cache=True,\n",
+ " )\n",
+ " print(len(result.extracted_content))\n",
+ "\n",
+ "# Uncomment the following line to run the OpenAI extraction example\n",
+ "await extract_openai_fees()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "BypA5YxEyZQN"
+ },
+ "source": [
+ "### Advanced Multi-Page Crawling with JavaScript Execution"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "tfkcVQ0b7mw-"
+ },
+ "source": [
+ "## Advanced Multi-Page Crawling with JavaScript Execution\n",
+ "\n",
+ "This example demonstrates Crawl4AI's ability to handle complex crawling scenarios, specifically extracting commits from multiple pages of a GitHub repository. The challenge here is that clicking the \"Next\" button doesn't load a new page, but instead uses asynchronous JavaScript to update the content. This is a common hurdle in modern web crawling.\n",
+ "\n",
+ "To overcome this, we use Crawl4AI's custom JavaScript execution to simulate clicking the \"Next\" button, and implement a custom hook to detect when new data has loaded. Our strategy involves comparing the first commit's text before and after \"clicking\" Next, waiting until it changes to confirm new data has rendered. This showcases Crawl4AI's flexibility in handling dynamic content and its ability to implement custom logic for even the most challenging crawling tasks."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "qUBKGpn3yZQN",
+ "outputId": "3e555b6a-ed33-42f4-cce9-499a923fbe17"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[LOG] 🌤️ Warming up the AsyncWebCrawler\n",
+ "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n",
+ "[LOG] 🕸️ Crawling https://github.com/microsoft/TypeScript/commits/main using AsyncPlaywrightCrawlerStrategy...\n",
+ "[LOG] ✅ Crawled https://github.com/microsoft/TypeScript/commits/main successfully!\n",
+ "[LOG] 🚀 Crawling done for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 5.16 seconds\n",
+ "[LOG] 🚀 Content extracted for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 0.28 seconds\n",
+ "[LOG] 🔥 Extracting semantic blocks for https://github.com/microsoft/TypeScript/commits/main, Strategy: AsyncWebCrawler\n",
+ "[LOG] 🚀 Extraction done for https://github.com/microsoft/TypeScript/commits/main, time taken: 0.28 seconds.\n",
+ "Page 1: Found 35 commits\n",
+ "[LOG] 🕸️ Crawling https://github.com/microsoft/TypeScript/commits/main using AsyncPlaywrightCrawlerStrategy...\n",
+ "[LOG] ✅ Crawled https://github.com/microsoft/TypeScript/commits/main successfully!\n",
+ "[LOG] 🚀 Crawling done for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 0.78 seconds\n",
+ "[LOG] 🚀 Content extracted for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 0.90 seconds\n",
+ "[LOG] 🔥 Extracting semantic blocks for https://github.com/microsoft/TypeScript/commits/main, Strategy: AsyncWebCrawler\n",
+ "[LOG] 🚀 Extraction done for https://github.com/microsoft/TypeScript/commits/main, time taken: 0.90 seconds.\n",
+ "Page 2: Found 35 commits\n",
+ "[LOG] 🕸️ Crawling https://github.com/microsoft/TypeScript/commits/main using AsyncPlaywrightCrawlerStrategy...\n",
+ "[LOG] ✅ Crawled https://github.com/microsoft/TypeScript/commits/main successfully!\n",
+ "[LOG] 🚀 Crawling done for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 2.00 seconds\n",
+ "[LOG] 🚀 Content extracted for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 0.74 seconds\n",
+ "[LOG] 🔥 Extracting semantic blocks for https://github.com/microsoft/TypeScript/commits/main, Strategy: AsyncWebCrawler\n",
+ "[LOG] 🚀 Extraction done for https://github.com/microsoft/TypeScript/commits/main, time taken: 0.75 seconds.\n",
+ "Page 3: Found 35 commits\n",
+ "Successfully crawled 105 commits across 3 pages\n"
+ ]
+ }
+ ],
+ "source": [
+ "import re\n",
+ "from bs4 import BeautifulSoup\n",
+ "\n",
+ "async def crawl_typescript_commits():\n",
+ " first_commit = \"\"\n",
+ " async def on_execution_started(page):\n",
+ " nonlocal first_commit\n",
+ " try:\n",
+ " while True:\n",
+ " await page.wait_for_selector('li.Box-sc-g0xbh4-0 h4')\n",
+ " commit = await page.query_selector('li.Box-sc-g0xbh4-0 h4')\n",
+ " commit = await commit.evaluate('(element) => element.textContent')\n",
+ " commit = re.sub(r'\\s+', '', commit)\n",
+ " if commit and commit != first_commit:\n",
+ " first_commit = commit\n",
+ " break\n",
+ " await asyncio.sleep(0.5)\n",
+ " except Exception as e:\n",
+ " print(f\"Warning: New content didn't appear after JavaScript execution: {e}\")\n",
+ "\n",
+ " async with AsyncWebCrawler(verbose=True) as crawler:\n",
+ " crawler.crawler_strategy.set_hook('on_execution_started', on_execution_started)\n",
+ "\n",
+ " url = \"https://github.com/microsoft/TypeScript/commits/main\"\n",
+ " session_id = \"typescript_commits_session\"\n",
+ " all_commits = []\n",
+ "\n",
+ " js_next_page = \"\"\"\n",
+ " const button = document.querySelector('a[data-testid=\"pagination-next-button\"]');\n",
+ " if (button) button.click();\n",
+ " \"\"\"\n",
+ "\n",
+ " for page in range(3): # Crawl 3 pages\n",
+ " result = await crawler.arun(\n",
+ " url=url,\n",
+ " session_id=session_id,\n",
+ " css_selector=\"li.Box-sc-g0xbh4-0\",\n",
+ " js=js_next_page if page > 0 else None,\n",
+ " bypass_cache=True,\n",
+ " js_only=page > 0\n",
+ " )\n",
+ "\n",
+ " assert result.success, f\"Failed to crawl page {page + 1}\"\n",
+ "\n",
+ " soup = BeautifulSoup(result.cleaned_html, 'html.parser')\n",
+ " commits = soup.select(\"li\")\n",
+ " all_commits.extend(commits)\n",
+ "\n",
+ " print(f\"Page {page + 1}: Found {len(commits)} commits\")\n",
+ "\n",
+ " await crawler.crawler_strategy.kill_session(session_id)\n",
+ " print(f\"Successfully crawled {len(all_commits)} commits across 3 pages\")\n",
+ "\n",
+ "await crawl_typescript_commits()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "EJRnYsp6yZQN"
+ },
+ "source": [
+ "### Using JsonCssExtractionStrategy for Fast Structured Output"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "1ZMqIzB_8SYp"
+ },
+ "source": [
+ "The JsonCssExtractionStrategy is a powerful feature of Crawl4AI that allows for precise, structured data extraction from web pages. Here's how it works:\n",
+ "\n",
+ "1. You define a schema that describes the pattern of data you're interested in extracting.\n",
+ "2. The schema includes a base selector that identifies repeating elements on the page.\n",
+ "3. Within the schema, you define fields, each with its own selector and type.\n",
+ "4. These field selectors are applied within the context of each base selector element.\n",
+ "5. The strategy supports nested structures, lists within lists, and various data types.\n",
+ "6. You can even include computed fields for more complex data manipulation.\n",
+ "\n",
+ "This approach allows for highly flexible and precise data extraction, transforming semi-structured web content into clean, structured JSON data. It's particularly useful for extracting consistent data patterns from pages like product listings, news articles, or search results.\n",
+ "\n",
+ "For more details and advanced usage, check out the full documentation on the Crawl4AI website."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "trCMR2T9yZQN",
+ "outputId": "718d36f4-cccf-40f4-8d8c-c3ba73524d16"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[LOG] 🌤️ Warming up the AsyncWebCrawler\n",
+ "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n",
+ "[LOG] 🕸️ Crawling https://www.nbcnews.com/business using AsyncPlaywrightCrawlerStrategy...\n",
+ "[LOG] ✅ Crawled https://www.nbcnews.com/business successfully!\n",
+ "[LOG] 🚀 Crawling done for https://www.nbcnews.com/business, success: True, time taken: 7.00 seconds\n",
+ "[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.32 seconds\n",
+ "[LOG] 🔥 Extracting semantic blocks for https://www.nbcnews.com/business, Strategy: AsyncWebCrawler\n",
+ "[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.48 seconds.\n",
+ "Successfully extracted 11 news teasers\n",
+ "{\n",
+ " \"category\": \"Business News\",\n",
+ " \"headline\": \"NBC ripped up its Olympics playbook for 2024 \\u2014 so far, the new strategy paid off\",\n",
+ " \"summary\": \"The Olympics have long been key to NBCUniversal. Paris marked the 18th Olympic Games broadcast by NBC in the U.S.\",\n",
+ " \"time\": \"13h ago\",\n",
+ " \"image\": {\n",
+ " \"src\": \"https://media-cldnry.s-nbcnews.com/image/upload/t_focal-200x100,f_auto,q_auto:best/rockcms/2024-09/240903-nbc-olympics-ch-1344-c7a486.jpg\",\n",
+ " \"alt\": \"Mike Tirico.\"\n",
+ " },\n",
+ " \"link\": \"https://www.nbcnews.com/business\"\n",
+ "}\n"
+ ]
+ }
+ ],
+ "source": [
+ "async def extract_news_teasers():\n",
+ " schema = {\n",
+ " \"name\": \"News Teaser Extractor\",\n",
+ " \"baseSelector\": \".wide-tease-item__wrapper\",\n",
+ " \"fields\": [\n",
+ " {\n",
+ " \"name\": \"category\",\n",
+ " \"selector\": \".unibrow span[data-testid='unibrow-text']\",\n",
+ " \"type\": \"text\",\n",
+ " },\n",
+ " {\n",
+ " \"name\": \"headline\",\n",
+ " \"selector\": \".wide-tease-item__headline\",\n",
+ " \"type\": \"text\",\n",
+ " },\n",
+ " {\n",
+ " \"name\": \"summary\",\n",
+ " \"selector\": \".wide-tease-item__description\",\n",
+ " \"type\": \"text\",\n",
+ " },\n",
+ " {\n",
+ " \"name\": \"time\",\n",
+ " \"selector\": \"[data-testid='wide-tease-date']\",\n",
+ " \"type\": \"text\",\n",
+ " },\n",
+ " {\n",
+ " \"name\": \"image\",\n",
+ " \"type\": \"nested\",\n",
+ " \"selector\": \"picture.teasePicture img\",\n",
+ " \"fields\": [\n",
+ " {\"name\": \"src\", \"type\": \"attribute\", \"attribute\": \"src\"},\n",
+ " {\"name\": \"alt\", \"type\": \"attribute\", \"attribute\": \"alt\"},\n",
+ " ],\n",
+ " },\n",
+ " {\n",
+ " \"name\": \"link\",\n",
+ " \"selector\": \"a[href]\",\n",
+ " \"type\": \"attribute\",\n",
+ " \"attribute\": \"href\",\n",
+ " },\n",
+ " ],\n",
+ " }\n",
+ "\n",
+ " extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)\n",
+ "\n",
+ " async with AsyncWebCrawler(verbose=True) as crawler:\n",
+ " result = await crawler.arun(\n",
+ " url=\"https://www.nbcnews.com/business\",\n",
+ " extraction_strategy=extraction_strategy,\n",
+ " bypass_cache=True,\n",
+ " )\n",
+ "\n",
+ " assert result.success, \"Failed to crawl the page\"\n",
+ "\n",
+ " news_teasers = json.loads(result.extracted_content)\n",
+ " print(f\"Successfully extracted {len(news_teasers)} news teasers\")\n",
+ " print(json.dumps(news_teasers[0], indent=2))\n",
+ "\n",
+ "await extract_news_teasers()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "FnyVhJaByZQN"
+ },
+ "source": [
+ "## Speed Comparison\n",
+ "\n",
+ "Let's compare the speed of Crawl4AI with Firecrawl, a paid service. Note that we can't run Firecrawl in this Colab environment, so we'll simulate its performance based on previously recorded data."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "agDD186f3wig"
+ },
+ "source": [
+ "💡 **Note on Speed Comparison:**\n",
+ "\n",
+ "The speed test conducted here is running on Google Colab, where the internet speed and performance can vary and may not reflect optimal conditions. When we call Firecrawl's API, we're seeing its best performance, while Crawl4AI's performance is limited by Colab's network speed.\n",
+ "\n",
+ "For a more accurate comparison, it's recommended to run these tests on your own servers or computers with a stable and fast internet connection. Despite these limitations, Crawl4AI still demonstrates faster performance in this environment.\n",
+ "\n",
+ "If you run these tests locally, you may observe an even more significant speed advantage for Crawl4AI compared to other services."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "F7KwHv8G1LbY"
+ },
+ "outputs": [],
+ "source": [
+ "!pip install firecrawl"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "91813zILyZQN",
+ "outputId": "663223db-ab89-4976-b233-05ceca62b19b"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Firecrawl (simulated):\n",
+ "Time taken: 4.38 seconds\n",
+ "Content length: 41967 characters\n",
+ "Images found: 49\n",
+ "\n",
+ "Crawl4AI (simple crawl):\n",
+ "Time taken: 4.22 seconds\n",
+ "Content length: 18221 characters\n",
+ "Images found: 49\n",
+ "\n",
+ "Crawl4AI (with JavaScript execution):\n",
+ "Time taken: 9.13 seconds\n",
+ "Content length: 34243 characters\n",
+ "Images found: 89\n"
+ ]
+ }
+ ],
+ "source": [
+ "import os\n",
+ "from google.colab import userdata\n",
+ "os.environ['FIRECRAWL_API_KEY'] = userdata.get('FIRECRAWL_API_KEY')\n",
+ "import time\n",
+ "from firecrawl import FirecrawlApp\n",
+ "\n",
+ "async def speed_comparison():\n",
+ " # Simulated Firecrawl performance\n",
+ " app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])\n",
+ " start = time.time()\n",
+ " scrape_status = app.scrape_url(\n",
+ " 'https://www.nbcnews.com/business',\n",
+ " params={'formats': ['markdown', 'html']}\n",
+ " )\n",
+ " end = time.time()\n",
+ " print(\"Firecrawl (simulated):\")\n",
+ " print(f\"Time taken: {end - start:.2f} seconds\")\n",
+ " print(f\"Content length: {len(scrape_status['markdown'])} characters\")\n",
+ " print(f\"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}\")\n",
+ " print()\n",
+ "\n",
+ " async with AsyncWebCrawler() as crawler:\n",
+ " # Crawl4AI simple crawl\n",
+ " start = time.time()\n",
+ " result = await crawler.arun(\n",
+ " url=\"https://www.nbcnews.com/business\",\n",
+ " word_count_threshold=0,\n",
+ " bypass_cache=True,\n",
+ " verbose=False\n",
+ " )\n",
+ " end = time.time()\n",
+ " print(\"Crawl4AI (simple crawl):\")\n",
+ " print(f\"Time taken: {end - start:.2f} seconds\")\n",
+ " print(f\"Content length: {len(result.markdown)} characters\")\n",
+ " print(f\"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}\")\n",
+ " print()\n",
+ "\n",
+ " # Crawl4AI with JavaScript execution\n",
+ " start = time.time()\n",
+ " result = await crawler.arun(\n",
+ " url=\"https://www.nbcnews.com/business\",\n",
+ " js_code=[\"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();\"],\n",
+ " word_count_threshold=0,\n",
+ " bypass_cache=True,\n",
+ " verbose=False\n",
+ " )\n",
+ " end = time.time()\n",
+ " print(\"Crawl4AI (with JavaScript execution):\")\n",
+ " print(f\"Time taken: {end - start:.2f} seconds\")\n",
+ " print(f\"Content length: {len(result.markdown)} characters\")\n",
+ " print(f\"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}\")\n",
+ "\n",
+ "await speed_comparison()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "OBFFYVJIyZQN"
+ },
+ "source": [
+ "If you run on a local machine with a proper internet speed:\n",
+ "- Simple crawl: Crawl4AI is typically over 3-4 times faster than Firecrawl.\n",
+ "- With JavaScript execution: Even when executing JavaScript to load more content (potentially doubling the number of images found), Crawl4AI is still faster than Firecrawl's simple crawl.\n",
+ "\n",
+ "Please note that actual performance may vary depending on network conditions and the specific content being crawled."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "A6_1RK1_yZQO"
+ },
+ "source": [
+ "## Conclusion\n",
+ "\n",
+ "In this notebook, we've explored the powerful features of Crawl4AI, including:\n",
+ "\n",
+ "1. Basic crawling\n",
+ "2. JavaScript execution and CSS selector usage\n",
+ "3. Proxy support\n",
+ "4. Structured data extraction with OpenAI\n",
+ "5. Advanced multi-page crawling with JavaScript execution\n",
+ "6. Fast structured output using JsonCssExtractionStrategy\n",
+ "7. Speed comparison with other services\n",
+ "\n",
+ "Crawl4AI offers a fast, flexible, and powerful solution for web crawling and data extraction tasks. Its asynchronous architecture and advanced features make it suitable for a wide range of applications, from simple web scraping to complex, multi-page data extraction scenarios.\n",
+ "\n",
+ "For more information and advanced usage, please visit the [Crawl4AI documentation](https://crawl4ai.com/mkdocs/).\n",
+ "\n",
+ "Happy crawling!"
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/docs/examples/research_assistant.py b/docs/examples/research_assistant.py
new file mode 100644
index 0000000000000000000000000000000000000000..de35ce8455606cac13fda96fc6a44f006a9a6822
--- /dev/null
+++ b/docs/examples/research_assistant.py
@@ -0,0 +1,195 @@
+# Make sure to install the required packages: chainlit and groq
+import os, time
+from openai import AsyncOpenAI
+import chainlit as cl
+import re
+import requests
+from io import BytesIO
+from chainlit.element import ElementBased
+from groq import Groq
+
+# Import threadpools to run the crawl_url function in a separate thread
+from concurrent.futures import ThreadPoolExecutor
+
+client = AsyncOpenAI(base_url="https://api.groq.com/openai/v1", api_key=os.getenv("GROQ_API_KEY"))
+
+# Instrument the OpenAI client
+cl.instrument_openai()
+
+settings = {
+ "model": "llama3-8b-8192",
+ "temperature": 0.5,
+ "max_tokens": 500,
+ "top_p": 1,
+ "frequency_penalty": 0,
+ "presence_penalty": 0,
+}
+
+def extract_urls(text):
+ url_pattern = re.compile(r'(https?://\S+)')
+ return url_pattern.findall(text)
+
+def crawl_url(url):
+ data = {
+ "urls": [url],
+ "include_raw_html": True,
+ "word_count_threshold": 10,
+ "extraction_strategy": "NoExtractionStrategy",
+ "chunking_strategy": "RegexChunking"
+ }
+ response = requests.post("https://crawl4ai.com/crawl", json=data)
+ response_data = response.json()
+ response_data = response_data['results'][0]
+ return response_data['markdown']
+
+@cl.on_chat_start
+async def on_chat_start():
+ cl.user_session.set("session", {
+ "history": [],
+ "context": {}
+ })
+ await cl.Message(
+ content="Welcome to the chat! How can I assist you today?"
+ ).send()
+
+@cl.on_message
+async def on_message(message: cl.Message):
+ user_session = cl.user_session.get("session")
+
+ # Extract URLs from the user's message
+ urls = extract_urls(message.content)
+
+
+ futures = []
+ with ThreadPoolExecutor() as executor:
+ for url in urls:
+ futures.append(executor.submit(crawl_url, url))
+
+ results = [future.result() for future in futures]
+
+ for url, result in zip(urls, results):
+ ref_number = f"REF_{len(user_session['context']) + 1}"
+ user_session["context"][ref_number] = {
+ "url": url,
+ "content": result
+ }
+
+
+ user_session["history"].append({
+ "role": "user",
+ "content": message.content
+ })
+
+ # Create a system message that includes the context
+ context_messages = [
+        f'<appendix ref="{ref}">\n{data["content"]}\n</appendix>'
+ for ref, data in user_session["context"].items()
+ ]
+ if context_messages:
+ system_message = {
+ "role": "system",
+ "content": (
+ "You are a helpful bot. Use the following context for answering questions. "
+ "Refer to the sources using the REF number in square brackets, e.g., [1], only if the source is given in the appendices below.\n\n"
+ "If the question requires any information from the provided appendices or context, refer to the sources. "
+ "If not, there is no need to add a references section. "
+ "At the end of your response, provide a reference section listing the URLs and their REF numbers only if sources from the appendices were used.\n\n"
+ "\n\n".join(context_messages)
+ )
+ }
+ else:
+ system_message = {
+ "role": "system",
+ "content": "You are a helpful assistant."
+ }
+
+
+ msg = cl.Message(content="")
+ await msg.send()
+
+ # Get response from the LLM
+ stream = await client.chat.completions.create(
+ messages=[
+ system_message,
+ *user_session["history"]
+ ],
+ stream=True,
+ **settings
+ )
+
+ assistant_response = ""
+ async for part in stream:
+ if token := part.choices[0].delta.content:
+ assistant_response += token
+ await msg.stream_token(token)
+
+ # Add assistant message to the history
+ user_session["history"].append({
+ "role": "assistant",
+ "content": assistant_response
+ })
+ await msg.update()
+
+ # Append the reference section to the assistant's response
+ reference_section = "\n\nReferences:\n"
+ for ref, data in user_session["context"].items():
+ reference_section += f"[{ref.split('_')[1]}]: {data['url']}\n"
+
+ msg.content += reference_section
+ await msg.update()
+
+
+@cl.on_audio_chunk
+async def on_audio_chunk(chunk: cl.AudioChunk):
+ if chunk.isStart:
+ buffer = BytesIO()
+ # This is required for whisper to recognize the file type
+ buffer.name = f"input_audio.{chunk.mimeType.split('/')[1]}"
+ # Initialize the session for a new audio stream
+ cl.user_session.set("audio_buffer", buffer)
+ cl.user_session.set("audio_mime_type", chunk.mimeType)
+
+ # Write the chunks to a buffer and transcribe the whole audio at the end
+ cl.user_session.get("audio_buffer").write(chunk.data)
+
+ pass
+
+@cl.step(type="tool")
+async def speech_to_text(audio_file):
+ response = await client.audio.transcriptions.create(
+ model="whisper-large-v3", file=audio_file
+ )
+
+ return response.text
+
+
+@cl.on_audio_end
+async def on_audio_end(elements: list[ElementBased]):
+ # Get the audio buffer from the session
+ audio_buffer: BytesIO = cl.user_session.get("audio_buffer")
+ audio_buffer.seek(0) # Move the file pointer to the beginning
+ audio_file = audio_buffer.read()
+ audio_mime_type: str = cl.user_session.get("audio_mime_type")
+
+ start_time = time.time()
+ whisper_input = (audio_buffer.name, audio_file, audio_mime_type)
+ transcription = await speech_to_text(whisper_input)
+ end_time = time.time()
+ print(f"Transcription took {end_time - start_time} seconds")
+
+ user_msg = cl.Message(
+ author="You",
+ type="user_message",
+ content=transcription
+ )
+ await user_msg.send()
+ await on_message(user_msg)
+
+
+if __name__ == "__main__":
+ from chainlit.cli import run_chainlit
+ run_chainlit(__file__)
+
+
diff --git a/docs/examples/rest_call.py b/docs/examples/rest_call.py
new file mode 100644
index 0000000000000000000000000000000000000000..465c61142992d340e7275a6efd4b8c7627cefb17
--- /dev/null
+++ b/docs/examples/rest_call.py
@@ -0,0 +1,64 @@
+
+import requests, base64, os
+
+data = {
+ "urls": ["https://www.nbcnews.com/business"],
+ "screenshot": True,
+}
+
+response = requests.post("https://crawl4ai.com/crawl", json=data)
+result = response.json()['results'][0]
+print(result.keys())
+# dict_keys(['url', 'html', 'success', 'cleaned_html', 'media',
+# 'links', 'screenshot', 'markdown', 'extracted_content',
+# 'metadata', 'error_message'])
+with open("screenshot.png", "wb") as f:
+ f.write(base64.b64decode(result['screenshot']))
+
+# Example of filtering the content using CSS selectors
+data = {
+ "urls": [
+ "https://www.nbcnews.com/business"
+ ],
+ "css_selector": "article",
+ "screenshot": True,
+}
+
+# Example of executing a JS script on the page before extracting the content
+data = {
+ "urls": [
+ "https://www.nbcnews.com/business"
+ ],
+ "screenshot": True,
+ 'js' : ["""
+ const loadMoreButton = Array.from(document.querySelectorAll('button')).
+ find(button => button.textContent.includes('Load More'));
+ loadMoreButton && loadMoreButton.click();
+ """]
+}
+
+# Example of using a custom extraction strategy
+data = {
+ "urls": [
+ "https://www.nbcnews.com/business"
+ ],
+ "extraction_strategy": "CosineStrategy",
+ "extraction_strategy_args": {
+ "semantic_filter": "inflation rent prices"
+ },
+}
+
+# Example of using LLM to extract content
+data = {
+ "urls": [
+ "https://www.nbcnews.com/business"
+ ],
+ "extraction_strategy": "LLMExtractionStrategy",
+ "extraction_strategy_args": {
+ "provider": "groq/llama3-8b-8192",
+ "api_token": os.environ.get("GROQ_API_KEY"),
+ "instruction": """I am interested in only financial news,
+ and translate them in French."""
+ },
+}
+
diff --git a/docs/examples/sample_ecommerce.html b/docs/examples/sample_ecommerce.html
new file mode 100644
index 0000000000000000000000000000000000000000..4698d9c69ba6ea36fd4709b7daca3ab78ec7f0fb
--- /dev/null
+++ b/docs/examples/sample_ecommerce.html
@@ -0,0 +1,106 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>Sample E-commerce Page for JsonCssExtractionStrategy Testing</title>
+</head>
+<body>
+    <h1>Sample E-commerce Product Catalog</h1>
+    <!-- ... product catalog markup (category sections and product cards) ... -->
+</body>
+</html>
\ No newline at end of file
diff --git a/docs/examples/ssl_example.py b/docs/examples/ssl_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..410e9485e4b17a1dd2af901f32eea283d6db24ea
--- /dev/null
+++ b/docs/examples/ssl_example.py
@@ -0,0 +1,46 @@
+"""Example showing how to work with SSL certificates in Crawl4AI."""
+
+import asyncio
+import os
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+
+# Create tmp directory if it doesn't exist
+parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+tmp_dir = os.path.join(parent_dir, "tmp")
+os.makedirs(tmp_dir, exist_ok=True)
+
+async def main():
+ # Configure crawler to fetch SSL certificate
+ config = CrawlerRunConfig(
+ fetch_ssl_certificate=True,
+ cache_mode=CacheMode.BYPASS # Bypass cache to always get fresh certificates
+ )
+
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url='https://example.com',
+ config=config
+ )
+
+ if result.success and result.ssl_certificate:
+ cert = result.ssl_certificate
+
+ # 1. Access certificate properties directly
+ print("\nCertificate Information:")
+ print(f"Issuer: {cert.issuer.get('CN', '')}")
+ print(f"Valid until: {cert.valid_until}")
+ print(f"Fingerprint: {cert.fingerprint}")
+
+ # 2. Export certificate in different formats
+ cert.to_json(os.path.join(tmp_dir, "certificate.json")) # For analysis
+ print("\nCertificate exported to:")
+ print(f"- JSON: {os.path.join(tmp_dir, 'certificate.json')}")
+
+ pem_data = cert.to_pem(os.path.join(tmp_dir, "certificate.pem")) # For web servers
+ print(f"- PEM: {os.path.join(tmp_dir, 'certificate.pem')}")
+
+ der_data = cert.to_der(os.path.join(tmp_dir, "certificate.der")) # For Java apps
+ print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}")
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/docs/examples/storage_state_tutorial.md b/docs/examples/storage_state_tutorial.md
new file mode 100644
index 0000000000000000000000000000000000000000..304e6399ad906f66c103b9b5b7789fb76e369d00
--- /dev/null
+++ b/docs/examples/storage_state_tutorial.md
@@ -0,0 +1,225 @@
+### Using `storage_state` to Pre-Load Cookies and LocalStorage
+
+Crawl4AI’s `AsyncWebCrawler` lets you preserve and reuse session data, including cookies and localStorage, across multiple runs. By providing a `storage_state`, you can start your crawls already “logged in” or with any other necessary session data, with no need to repeat the login flow every time.
+
+#### What is `storage_state`?
+
+`storage_state` can be:
+
+- A dictionary containing cookies and localStorage data.
+- A path to a JSON file that holds this information.
+
+When you pass `storage_state` to the crawler, it applies these cookies and localStorage entries before loading any pages. This means your crawler effectively starts in a known authenticated or pre-configured state.
+
+#### Example Structure
+
+Here’s an example storage state:
+
+```json
+{
+ "cookies": [
+ {
+ "name": "session",
+ "value": "abcd1234",
+ "domain": "example.com",
+ "path": "/",
+ "expires": 1675363572.037711,
+ "httpOnly": false,
+ "secure": false,
+ "sameSite": "None"
+ }
+ ],
+ "origins": [
+ {
+ "origin": "https://example.com",
+ "localStorage": [
+ { "name": "token", "value": "my_auth_token" },
+ { "name": "refreshToken", "value": "my_refresh_token" }
+ ]
+ }
+ ]
+}
+```
+
+This JSON sets a `session` cookie and two localStorage entries (`token` and `refreshToken`) for `https://example.com`.
+
+---
+
+### Passing `storage_state` as a Dictionary
+
+You can directly provide the data as a dictionary:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+
+async def main():
+ storage_dict = {
+ "cookies": [
+ {
+ "name": "session",
+ "value": "abcd1234",
+ "domain": "example.com",
+ "path": "/",
+ "expires": 1675363572.037711,
+ "httpOnly": False,
+ "secure": False,
+ "sameSite": "None"
+ }
+ ],
+ "origins": [
+ {
+ "origin": "https://example.com",
+ "localStorage": [
+ {"name": "token", "value": "my_auth_token"},
+ {"name": "refreshToken", "value": "my_refresh_token"}
+ ]
+ }
+ ]
+ }
+
+ async with AsyncWebCrawler(
+ headless=True,
+ storage_state=storage_dict
+ ) as crawler:
+ result = await crawler.arun(url='https://example.com/protected')
+ if result.success:
+ print("Crawl succeeded with pre-loaded session data!")
+ print("Page HTML length:", len(result.html))
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+---
+
+### Passing `storage_state` as a File
+
+If you prefer a file-based approach, save the JSON above to `mystate.json` and reference it:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+
+async def main():
+ async with AsyncWebCrawler(
+ headless=True,
+ storage_state="mystate.json" # Uses a JSON file instead of a dictionary
+ ) as crawler:
+ result = await crawler.arun(url='https://example.com/protected')
+ if result.success:
+ print("Crawl succeeded with pre-loaded session data!")
+ print("Page HTML length:", len(result.html))
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+---
+
+### Using `storage_state` to Avoid Repeated Logins (Sign In Once, Use Later)
+
+A common scenario is when you need to log in to a site (entering username/password, etc.) to access protected pages. Doing so every crawl is cumbersome. Instead, you can:
+
+1. Perform the login once in a hook.
+2. After login completes, export the resulting `storage_state` to a file.
+3. On subsequent runs, provide that `storage_state` to skip the login step.
+
+**Step-by-Step Example:**
+
+**First Run (Perform Login and Save State):**
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+async def on_browser_created_hook(browser):
+ # Access the default context and create a page
+ context = browser.contexts[0]
+ page = await context.new_page()
+
+ # Navigate to the login page
+ await page.goto("https://example.com/login", wait_until="domcontentloaded")
+
+ # Fill in credentials and submit
+ await page.fill("input[name='username']", "myuser")
+ await page.fill("input[name='password']", "mypassword")
+ await page.click("button[type='submit']")
+ await page.wait_for_load_state("networkidle")
+
+ # Now the site sets tokens in localStorage and cookies
+ # Export this state to a file so we can reuse it
+ await context.storage_state(path="my_storage_state.json")
+ await page.close()
+
+async def main():
+ # First run: perform login and export the storage_state
+ async with AsyncWebCrawler(
+ headless=True,
+ verbose=True,
+ hooks={"on_browser_created": on_browser_created_hook},
+ use_persistent_context=True,
+ user_data_dir="./my_user_data"
+ ) as crawler:
+
+ # After on_browser_created_hook runs, we have storage_state saved to my_storage_state.json
+ result = await crawler.arun(
+ url='https://example.com/protected-page',
+ cache_mode=CacheMode.BYPASS,
+ markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
+ )
+ print("First run result success:", result.success)
+ if result.success:
+ print("Protected page HTML length:", len(result.html))
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+**Second Run (Reuse Saved State, No Login Needed):**
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+async def main():
+ # Second run: no need to hook on_browser_created this time.
+ # Just provide the previously saved storage state.
+ async with AsyncWebCrawler(
+ headless=True,
+ verbose=True,
+ use_persistent_context=True,
+ user_data_dir="./my_user_data",
+ storage_state="my_storage_state.json" # Reuse previously exported state
+ ) as crawler:
+
+ # Now the crawler starts already logged in
+ result = await crawler.arun(
+ url='https://example.com/protected-page',
+ cache_mode=CacheMode.BYPASS,
+ markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
+ )
+ print("Second run result success:", result.success)
+ if result.success:
+ print("Protected page HTML length:", len(result.html))
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+**What’s Happening Here?**
+
+- During the first run, the `on_browser_created_hook` logs into the site.
+- After logging in, the crawler exports the current session (cookies, localStorage, etc.) to `my_storage_state.json`.
+- On subsequent runs, passing `storage_state="my_storage_state.json"` starts the browser context with these tokens already in place, skipping the login steps.
+
+**Sign Out Scenario:**
+If the website allows you to sign out by clearing tokens or by navigating to a sign-out URL, you can also run a script that uses `on_browser_created_hook` or `arun` to simulate signing out, then export the resulting `storage_state` again. That gives you a baseline “logged out” state to start fresh next time.
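+
+A minimal sketch of that sign-out flow, modeled on the first-run hook above (the `/logout` URL and the exported filename are illustrative placeholders):
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+
+async def sign_out_hook(browser):
+    # Start from the logged-in profile, then end the session on the site side
+    context = browser.contexts[0]
+    page = await context.new_page()
+    # Placeholder sign-out URL; some sites need a button click instead
+    await page.goto("https://example.com/logout", wait_until="domcontentloaded")
+    await page.wait_for_load_state("networkidle")
+    # Export the now logged-out session as a clean baseline for future runs
+    await context.storage_state(path="my_logged_out_state.json")
+    await page.close()
+
+async def main():
+    async with AsyncWebCrawler(
+        headless=True,
+        verbose=True,
+        hooks={"on_browser_created": sign_out_hook},
+        use_persistent_context=True,
+        user_data_dir="./my_user_data"
+    ) as crawler:
+        result = await crawler.arun(url="https://example.com")
+        print("Sign-out run success:", result.success)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```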
+
+---
+
+### Conclusion
+
+By using `storage_state`, you can skip repetitive actions, like logging in, and jump straight into crawling protected content. Whether you provide a file path or a dictionary, this powerful feature helps maintain state between crawls, simplifying your data extraction pipelines.
\ No newline at end of file
diff --git a/docs/examples/summarize_page.py b/docs/examples/summarize_page.py
new file mode 100644
index 0000000000000000000000000000000000000000..8515899970a3d02301c62803dd99999a0cfa42be
--- /dev/null
+++ b/docs/examples/summarize_page.py
@@ -0,0 +1,46 @@
+import os
+import time
+import json
+from crawl4ai.web_crawler import WebCrawler
+from crawl4ai.chunking_strategy import *
+from crawl4ai.extraction_strategy import *
+from crawl4ai.crawler_strategy import *
+
+url = r'https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot'
+
+crawler = WebCrawler()
+crawler.warmup()
+
+from pydantic import BaseModel, Field
+
+class PageSummary(BaseModel):
+ title: str = Field(..., description="Title of the page.")
+ summary: str = Field(..., description="Summary of the page.")
+ brief_summary: str = Field(..., description="Brief summary of the page.")
+ keywords: list = Field(..., description="Keywords assigned to the page.")
+
+result = crawler.run(
+ url=url,
+ word_count_threshold=1,
+    extraction_strategy=LLMExtractionStrategy(
+        provider="openai/gpt-4o",
+        api_token=os.getenv('OPENAI_API_KEY'),
+        schema=PageSummary.model_json_schema(),
+        extraction_type="schema",
+        apply_chunking=False,
+ instruction="From the crawled content, extract the following details: "\
+ "1. Title of the page "\
+ "2. Summary of the page, which is a detailed summary "\
+ "3. Brief summary of the page, which is a paragraph text "\
+ "4. Keywords assigned to the page, which is a list of keywords. "\
+ 'The extracted JSON format should look like this: '\
+ '{ "title": "Page Title", "summary": "Detailed summary of the page.", "brief_summary": "Brief summary in a paragraph.", "keywords": ["keyword1", "keyword2", "keyword3"] }'
+ ),
+ bypass_cache=True,
+)
+
+page_summary = json.loads(result.extracted_content)
+
+print(page_summary)
+
+with open(".data/page_summary.json", "w", encoding="utf-8") as f:
+ f.write(result.extracted_content)
diff --git a/docs/examples/tutorial_dynamic_clicks.md b/docs/examples/tutorial_dynamic_clicks.md
new file mode 100644
index 0000000000000000000000000000000000000000..d9669952b4a63e47d71ea71d9396d3d77ae3888b
--- /dev/null
+++ b/docs/examples/tutorial_dynamic_clicks.md
@@ -0,0 +1,117 @@
+# Tutorial: Clicking Buttons to Load More Content with Crawl4AI
+
+## Introduction
+
+When scraping dynamic websites, it’s common to encounter “Load More” or “Next” buttons that must be clicked to reveal new content. Crawl4AI provides a straightforward way to handle these situations using JavaScript execution and waiting conditions. In this tutorial, we’ll cover two approaches:
+
+1. **Step-by-step (Session-based) Approach:** Multiple calls to `arun()` to progressively load more content.
+2. **Single-call Approach:** Execute a more complex JavaScript snippet inside a single `arun()` call to handle all clicks at once before the extraction.
+
+## Prerequisites
+
+- A working installation of Crawl4AI
+- Basic familiarity with Python’s `async`/`await` syntax
+
+## Step-by-Step Approach
+
+Use a session ID to maintain state across multiple `arun()` calls:
+
+```python
+from crawl4ai import AsyncWebCrawler, CacheMode
+
+js_code = [
+ # This JS finds the “Next” button and clicks it
+ "const nextButton = document.querySelector('button.next'); nextButton && nextButton.click();"
+]
+
+wait_for_condition = "css:.new-content-class"
+
+async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
+ # 1. Load the initial page
+ result_initial = await crawler.arun(
+ url="https://example.com",
+ cache_mode=CacheMode.BYPASS,
+ session_id="my_session"
+ )
+
+ # 2. Click the 'Next' button and wait for new content
+ result_next = await crawler.arun(
+ url="https://example.com",
+ session_id="my_session",
+ js_code=js_code,
+ wait_for=wait_for_condition,
+ js_only=True,
+ cache_mode=CacheMode.BYPASS
+ )
+
+# `result_next` now contains the updated HTML after clicking 'Next'
+```
+
+**Key Points:**
+- **`session_id`**: Keeps the same browser context open.
+- **`js_code`**: Executes JavaScript in the context of the already loaded page.
+- **`wait_for`**: Ensures the crawler waits until new content is fully loaded.
+- **`js_only=True`**: Runs the JS in the current session without reloading the page.
+
+By repeating the `arun()` call multiple times and modifying the `js_code` (e.g., clicking different modules or pages), you can iteratively load all the desired content.
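+
+For instance, a minimal sketch of such a loop in the same session might look like this (the page count, button selector, and wait condition are placeholders to adapt to your site):
+
+```python
+from crawl4ai import AsyncWebCrawler, CacheMode
+
+next_button_js = [
+    "const nextButton = document.querySelector('button.next'); nextButton && nextButton.click();"
+]
+
+async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
+    # Initial load establishes the session
+    result = await crawler.arun(
+        url="https://example.com",
+        cache_mode=CacheMode.BYPASS,
+        session_id="my_session"
+    )
+
+    # Click "Next" a few more times within the same browser session
+    for page_number in range(2, 5):
+        result = await crawler.arun(
+            url="https://example.com",
+            session_id="my_session",
+            js_code=next_button_js,
+            wait_for="css:.new-content-class",
+            js_only=True,
+            cache_mode=CacheMode.BYPASS
+        )
+        print(f"Page {page_number} HTML length: {len(result.html)}")
+```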
+
+## Single-call Approach
+
+If the page allows it, you can run a single `arun()` call with a more elaborate JavaScript snippet that:
+- Iterates over all the modules or "Next" buttons
+- Clicks them one by one
+- Waits for content updates between each click
+- Once done, returns control to Crawl4AI for extraction.
+
+Example snippet:
+
+```python
+from crawl4ai import AsyncWebCrawler, CacheMode
+
+js_code = [
+ # Example JS that clicks multiple modules:
+ """
+ (async () => {
+ const modules = document.querySelectorAll('.module-item');
+ for (let i = 0; i < modules.length; i++) {
+ modules[i].scrollIntoView();
+ modules[i].click();
+ // Wait for each module’s content to load, adjust 100ms as needed
+ await new Promise(r => setTimeout(r, 100));
+ }
+ })();
+ """
+]
+
+async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
+ result = await crawler.arun(
+ url="https://example.com",
+ js_code=js_code,
+ wait_for="css:.final-loaded-content-class",
+ cache_mode=CacheMode.BYPASS
+ )
+
+# `result` now contains all content after all modules have been clicked in one go.
+```
+
+**Key Points:**
+- All interactions (clicks and waits) happen before the extraction.
+- Ideal for pages where all steps can be done in a single pass.
+
+## Choosing the Right Approach
+
+- **Step-by-Step (Session-based)**:
+ - Good when you need fine-grained control or must dynamically check conditions before clicking the next page.
+ - Useful if the page requires multiple conditions checked at runtime.
+
+- **Single-call**:
+ - Perfect if the sequence of interactions is known in advance.
+ - Cleaner code if the page’s structure is consistent and predictable.
+
+## Conclusion
+
+Crawl4AI makes it easy to handle dynamic content:
+- Use session IDs and multiple `arun()` calls for stepwise crawling.
+- Or pack all actions into one `arun()` call if the interactions are well-defined upfront.
+
+This flexibility ensures you can handle a wide range of dynamic web pages efficiently.
diff --git a/docs/examples/v0.3.74.overview.py b/docs/examples/v0.3.74.overview.py
new file mode 100644
index 0000000000000000000000000000000000000000..362ae8fc4446a9173e0dd91c6f0f9d995c6b7d53
--- /dev/null
+++ b/docs/examples/v0.3.74.overview.py
@@ -0,0 +1,277 @@
+import os, sys
+# append the parent directory to the sys.path
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+parent_parent_dir = os.path.dirname(parent_dir)
+sys.path.append(parent_parent_dir)
+__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+__data__ = os.path.join(__location__, "__data")
+import asyncio
+from pathlib import Path
+import aiohttp
+import json
+from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai.content_filter_strategy import BM25ContentFilter
+
+# 1. File Download Processing Example
+async def download_example():
+ """Example of downloading files from Python.org"""
+ # downloads_path = os.path.join(os.getcwd(), "downloads")
+ downloads_path = os.path.join(Path.home(), ".crawl4ai", "downloads")
+ os.makedirs(downloads_path, exist_ok=True)
+
+ print(f"Downloads will be saved to: {downloads_path}")
+
+ async with AsyncWebCrawler(
+ accept_downloads=True,
+ downloads_path=downloads_path,
+ verbose=True
+ ) as crawler:
+ result = await crawler.arun(
+ url="https://www.python.org/downloads/",
+ js_code="""
+ // Find and click the first Windows installer link
+ const downloadLink = document.querySelector('a[href$=".exe"]');
+ if (downloadLink) {
+ console.log('Found download link:', downloadLink.href);
+ downloadLink.click();
+ } else {
+ console.log('No .exe download link found');
+ }
+ """,
+            delay_before_return_html=1,  # Wait 1 second to give the download time to start
+ cache_mode=CacheMode.BYPASS
+ )
+
+ if result.downloaded_files:
+ print("\nDownload successful!")
+ print("Downloaded files:")
+ for file_path in result.downloaded_files:
+ print(f"- {file_path}")
+ print(f" File size: {os.path.getsize(file_path) / (1024*1024):.2f} MB")
+ else:
+ print("\nNo files were downloaded")
+
+# 2. Local File and Raw HTML Processing Example
+async def local_and_raw_html_example():
+ """Example of processing local files and raw HTML"""
+ # Create a sample HTML file
+    os.makedirs(__data__, exist_ok=True)
+    sample_file = os.path.join(__data__, "sample.html")
+    with open(sample_file, "w") as f:
+        f.write("""
+        <html>
+            <body>
+                <h1>Test Content</h1>
+                <p>This is a test paragraph.</p>
+            </body>
+        </html>
+        """)
+
+ async with AsyncWebCrawler(verbose=True) as crawler:
+ # Process local file
+ local_result = await crawler.arun(
+ url=f"file://{os.path.abspath(sample_file)}"
+ )
+
+ # Process raw HTML
+ raw_html = """
+
+ Raw HTML Test
+ This is a test of raw HTML processing.
+
+ """
+ raw_result = await crawler.arun(
+ url=f"raw:{raw_html}"
+ )
+
+ # Clean up
+ os.remove(sample_file)
+
+ print("Local file content:", local_result.markdown)
+ print("\nRaw HTML content:", raw_result.markdown)
+
+# 3. Enhanced Markdown Generation Example
+async def markdown_generation_example():
+ """Example of enhanced markdown generation with citations and LLM-friendly features"""
+ async with AsyncWebCrawler(verbose=True) as crawler:
+ # Create a content filter (optional)
+ content_filter = BM25ContentFilter(
+ # user_query="History and cultivation",
+ bm25_threshold=1.0
+ )
+
+ result = await crawler.arun(
+ url="https://en.wikipedia.org/wiki/Apple",
+ css_selector="main div#bodyContent",
+ content_filter=content_filter,
+ cache_mode=CacheMode.BYPASS
+ )
+
+        # Alternatively, run the same crawl with a default BM25ContentFilter:
+ result = await crawler.arun(
+ url="https://en.wikipedia.org/wiki/Apple",
+ css_selector="main div#bodyContent",
+ content_filter=BM25ContentFilter()
+ )
+ print(result.markdown_v2.fit_markdown)
+
+ print("\nMarkdown Generation Results:")
+ print(f"1. Original markdown length: {len(result.markdown)}")
+ print(f"2. New markdown versions (markdown_v2):")
+ print(f" - Raw markdown length: {len(result.markdown_v2.raw_markdown)}")
+ print(f" - Citations markdown length: {len(result.markdown_v2.markdown_with_citations)}")
+ print(f" - References section length: {len(result.markdown_v2.references_markdown)}")
+ if result.markdown_v2.fit_markdown:
+ print(f" - Filtered markdown length: {len(result.markdown_v2.fit_markdown)}")
+
+ # Save examples to files
+ output_dir = os.path.join(__data__, "markdown_examples")
+ os.makedirs(output_dir, exist_ok=True)
+
+ # Save different versions
+ with open(os.path.join(output_dir, "1_raw_markdown.md"), "w") as f:
+ f.write(result.markdown_v2.raw_markdown)
+
+ with open(os.path.join(output_dir, "2_citations_markdown.md"), "w") as f:
+ f.write(result.markdown_v2.markdown_with_citations)
+
+ with open(os.path.join(output_dir, "3_references.md"), "w") as f:
+ f.write(result.markdown_v2.references_markdown)
+
+ if result.markdown_v2.fit_markdown:
+ with open(os.path.join(output_dir, "4_filtered_markdown.md"), "w") as f:
+ f.write(result.markdown_v2.fit_markdown)
+
+ print(f"\nMarkdown examples saved to: {output_dir}")
+
+ # Show a sample of citations and references
+ print("\nSample of markdown with citations:")
+ print(result.markdown_v2.markdown_with_citations[:500] + "...\n")
+ print("Sample of references:")
+ print('\n'.join(result.markdown_v2.references_markdown.split('\n')[:10]) + "...")
+
+# 4. Browser Management Example
+async def browser_management_example():
+ """Example of using enhanced browser management features"""
+ # Use the specified user directory path
+ user_data_dir = os.path.join(Path.home(), ".crawl4ai", "browser_profile")
+ os.makedirs(user_data_dir, exist_ok=True)
+
+ print(f"Browser profile will be saved to: {user_data_dir}")
+
+ async with AsyncWebCrawler(
+ use_managed_browser=True,
+ user_data_dir=user_data_dir,
+ headless=False,
+ verbose=True
+ ) as crawler:
+
+ result = await crawler.arun(
+ url="https://crawl4ai.com",
+ # session_id="persistent_session_1",
+ cache_mode=CacheMode.BYPASS
+ )
+ # Use GitHub as an example - it's a good test for browser management
+ # because it requires proper browser handling
+ result = await crawler.arun(
+ url="https://github.com/trending",
+ # session_id="persistent_session_1",
+ cache_mode=CacheMode.BYPASS
+ )
+
+ print("\nBrowser session result:", result.success)
+ if result.success:
+ print("Page title:", result.metadata.get('title', 'No title found'))
+
+# 5. API Usage Example
+async def api_example():
+ """Example of using the new API endpoints"""
+ api_token = os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code"
+ headers = {'Authorization': f'Bearer {api_token}'}
+ async with aiohttp.ClientSession() as session:
+ # Submit crawl job
+ crawl_request = {
+ "urls": ["https://news.ycombinator.com"], # Hacker News as an example
+ "extraction_config": {
+ "type": "json_css",
+ "params": {
+ "schema": {
+ "name": "Hacker News Articles",
+ "baseSelector": ".athing",
+ "fields": [
+ {
+ "name": "title",
+ "selector": ".title a",
+ "type": "text"
+ },
+ {
+ "name": "score",
+ "selector": ".score",
+ "type": "text"
+ },
+ {
+ "name": "url",
+ "selector": ".title a",
+ "type": "attribute",
+ "attribute": "href"
+ }
+ ]
+ }
+ }
+ },
+ "crawler_params": {
+ "headless": True,
+ # "use_managed_browser": True
+ },
+ "cache_mode": "bypass",
+ # "screenshot": True,
+ # "magic": True
+ }
+
+ async with session.post(
+ "http://localhost:11235/crawl",
+ json=crawl_request,
+ headers=headers
+ ) as response:
+ task_data = await response.json()
+ task_id = task_data["task_id"]
+
+ # Check task status
+ while True:
+ async with session.get(
+ f"http://localhost:11235/task/{task_id}",
+ headers=headers
+ ) as status_response:
+ result = await status_response.json()
+ print(f"Task status: {result['status']}")
+
+ if result["status"] == "completed":
+ print("Task completed!")
+ print("Results:")
+ news = json.loads(result["results"][0]['extracted_content'])
+ print(json.dumps(news[:4], indent=2))
+ break
+ else:
+ await asyncio.sleep(1)
+
+# Main execution
+async def main():
+ # print("Running Crawl4AI feature examples...")
+
+ # print("\n1. Running Download Example:")
+ # await download_example()
+
+ # print("\n2. Running Markdown Generation Example:")
+ # await markdown_generation_example()
+
+ # # print("\n3. Running Local and Raw HTML Example:")
+ # await local_and_raw_html_example()
+
+ # # print("\n4. Running Browser Management Example:")
+ await browser_management_example()
+
+ # print("\n5. Running API Example:")
+ await api_example()
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/docs/examples/v0_4_24_walkthrough.py b/docs/examples/v0_4_24_walkthrough.py
new file mode 100644
index 0000000000000000000000000000000000000000..135ac29c7ef9f4b75d328f3583675867d66367e0
--- /dev/null
+++ b/docs/examples/v0_4_24_walkthrough.py
@@ -0,0 +1,443 @@
+"""
+Crawl4AI v0.4.24 Feature Walkthrough
+===================================
+
+This script demonstrates the new features introduced in Crawl4AI v0.4.24.
+Each section includes detailed examples and explanations of the new capabilities.
+"""
+
+import asyncio
+import os
+import json
+import re
+from typing import List, Optional, Dict, Any
+from pydantic import BaseModel, Field
+from crawl4ai import (
+ AsyncWebCrawler,
+ BrowserConfig,
+ CrawlerRunConfig,
+ CacheMode,
+ LLMExtractionStrategy,
+ JsonCssExtractionStrategy
+)
+from crawl4ai.content_filter_strategy import RelevantContentFilter
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+from bs4 import BeautifulSoup
+
+# Sample HTML for demonstrations
+SAMPLE_HTML = """
+
+
+
+
+
+
First post content...
+
Read More
+
+
+
+
+
+
+
Second post content...
+
Read More
+
+
+
+"""
+
+async def demo_ssl_features():
+ """
+ Enhanced SSL & Security Features Demo
+ -----------------------------------
+
+ This example demonstrates the new SSL certificate handling and security features:
+ 1. Custom certificate paths
+ 2. SSL verification options
+ 3. HTTPS error handling
+ 4. Certificate validation configurations
+
+ These features are particularly useful when:
+ - Working with self-signed certificates
+ - Dealing with corporate proxies
+ - Handling mixed content websites
+ - Managing different SSL security levels
+ """
+ print("\n1. Enhanced SSL & Security Demo")
+ print("--------------------------------")
+
+ browser_config = BrowserConfig()
+
+ run_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ fetch_ssl_certificate=True # Enable SSL certificate fetching
+ )
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(
+ url="https://example.com",
+ config=run_config
+ )
+ print(f"SSL Crawl Success: {result.success}")
+ result.ssl_certificate.to_json(
+ os.path.join(os.getcwd(), "ssl_certificate.json")
+ )
+ if not result.success:
+ print(f"SSL Error: {result.error_message}")
+
+async def demo_content_filtering():
+ """
+ Smart Content Filtering Demo
+ ----------------------
+
+ Demonstrates advanced content filtering capabilities:
+ 1. Custom filter to identify and extract specific content
+ 2. Integration with markdown generation
+ 3. Flexible pruning rules
+ """
+ print("\n2. Smart Content Filtering Demo")
+ print("--------------------------------")
+
+ # Create a custom content filter
+ class CustomNewsFilter(RelevantContentFilter):
+ def __init__(self):
+ super().__init__()
+ # Add news-specific patterns
+ self.negative_patterns = re.compile(
+ r'nav|footer|header|sidebar|ads|comment|share|related|recommended|popular|trending',
+ re.I
+ )
+ self.min_word_count = 30 # Higher threshold for news content
+
+ def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
+ """
+ Implements news-specific content filtering logic.
+
+ Args:
+ html (str): HTML content to be filtered
+ min_word_threshold (int, optional): Minimum word count threshold
+
+ Returns:
+ List[str]: List of filtered HTML content blocks
+ """
+ if not html or not isinstance(html, str):
+ return []
+
+ soup = BeautifulSoup(html, 'lxml')
+ if not soup.body:
+            soup = BeautifulSoup(f'<body>{html}</body>', 'lxml')
+
+ body = soup.find('body')
+
+ # Extract chunks with metadata
+ chunks = self.extract_text_chunks(body, min_word_threshold or self.min_word_count)
+
+ # Filter chunks based on news-specific criteria
+ filtered_chunks = []
+ for _, text, tag_type, element in chunks:
+ # Skip if element has negative class/id
+ if self.is_excluded(element):
+ continue
+
+ # Headers are important in news articles
+ if tag_type == 'header':
+ filtered_chunks.append(self.clean_element(element))
+ continue
+
+ # For content, check word count and link density
+ text = element.get_text(strip=True)
+ if len(text.split()) >= (min_word_threshold or self.min_word_count):
+ # Calculate link density
+ links_text = ' '.join(a.get_text(strip=True) for a in element.find_all('a'))
+ link_density = len(links_text) / len(text) if text else 1
+
+ # Accept if link density is reasonable
+ if link_density < 0.5:
+ filtered_chunks.append(self.clean_element(element))
+
+ return filtered_chunks
+
+ # Create markdown generator with custom filter
+ markdown_gen = DefaultMarkdownGenerator(
+ content_filter=CustomNewsFilter()
+ )
+
+ run_config = CrawlerRunConfig(
+ markdown_generator=markdown_gen,
+ cache_mode=CacheMode.BYPASS
+ )
+
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url="https://news.ycombinator.com",
+ config=run_config
+ )
+ print("Filtered Content Sample:")
+ print(result.markdown[:500]) # Show first 500 chars
+
+async def demo_json_extraction():
+ """
+ Improved JSON Extraction Demo
+ ---------------------------
+
+ Demonstrates the enhanced JSON extraction capabilities:
+ 1. Base element attributes extraction
+ 2. Complex nested structures
+ 3. Multiple extraction patterns
+
+ Key features shown:
+ - Extracting attributes from base elements (href, data-* attributes)
+ - Processing repeated patterns
+ - Handling optional fields
+ """
+ print("\n3. Improved JSON Extraction Demo")
+ print("--------------------------------")
+
+ # Define the extraction schema with base element attributes
+ json_strategy = JsonCssExtractionStrategy(
+ schema={
+ "name": "Blog Posts",
+ "baseSelector": "div.article-list",
+ "baseFields": [
+ {"name": "list_id", "type": "attribute", "attribute": "data-list-id"},
+ {"name": "category", "type": "attribute", "attribute": "data-category"}
+ ],
+ "fields": [
+ {
+ "name": "posts",
+ "selector": "article.post",
+ "type": "nested_list",
+ "baseFields": [
+ {"name": "post_id", "type": "attribute", "attribute": "data-post-id"},
+ {"name": "author_id", "type": "attribute", "attribute": "data-author"}
+ ],
+ "fields": [
+ {
+ "name": "title",
+ "selector": "h2.title a",
+ "type": "text",
+ "baseFields": [
+ {"name": "url", "type": "attribute", "attribute": "href"}
+ ]
+ },
+ {
+ "name": "author",
+ "selector": "div.meta a.author",
+ "type": "text",
+ "baseFields": [
+ {"name": "profile_url", "type": "attribute", "attribute": "href"}
+ ]
+ },
+ {
+ "name": "date",
+ "selector": "span.date",
+ "type": "text"
+ },
+ {
+ "name": "read_more",
+ "selector": "a.read-more",
+ "type": "nested",
+ "fields": [
+ {"name": "text", "type": "text"},
+ {"name": "url", "type": "attribute", "attribute": "href"}
+ ]
+ }
+ ]
+ }
+ ]
+ }
+ )
+
+ # Demonstrate extraction from raw HTML
+ run_config = CrawlerRunConfig(
+ extraction_strategy=json_strategy,
+ cache_mode=CacheMode.BYPASS
+ )
+
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url="raw:" + SAMPLE_HTML, # Use raw: prefix for raw HTML
+ config=run_config
+ )
+ print("Extracted Content:")
+ print(result.extracted_content)
+
+async def demo_input_formats():
+ """
+ Input Format Handling Demo
+ ----------------------
+
+ Demonstrates how LLM extraction can work with different input formats:
+ 1. Markdown (default) - Good for simple text extraction
+ 2. HTML - Better when you need structure and attributes
+
+ This example shows how HTML input can be beneficial when:
+ - You need to understand the DOM structure
+ - You want to extract both visible text and HTML attributes
+ - The content has complex layouts like tables or forms
+ """
+ print("\n4. Input Format Handling Demo")
+ print("---------------------------")
+
+ # Create a dummy HTML with rich structure
+ dummy_html = """
+
+
+
+
+
+
Technical Requirements
+
+
+ 5+ years experience in Machine Learning
+
+
+ Proficiency in Python and PyTorch/TensorFlow
+
+
+ Experience with distributed training systems
+
+
+
+
+
+
Professional Skills
+
+
+ Strong problem-solving abilities
+
+
+ Experience leading technical teams
+
+
+
+
+
+
+
+ Application Deadline: February 28, 2024
+
+
+
+
+
+ """
+
+ # Use raw:// prefix to pass HTML content directly
+ url = f"raw://{dummy_html}"
+
+ from pydantic import BaseModel, Field
+ from typing import List, Optional
+
+ # Define our schema using Pydantic
+ class JobRequirement(BaseModel):
+ category: str = Field(description="Category of the requirement (e.g., Technical, Soft Skills)")
+ items: List[str] = Field(description="List of specific requirements in this category")
+ priority: str = Field(description="Priority level (Required/Preferred) based on the HTML class or context")
+
+ class JobPosting(BaseModel):
+ title: str = Field(description="Job title")
+ department: str = Field(description="Department or team")
+ location: str = Field(description="Job location, including remote options")
+ salary_range: Optional[str] = Field(description="Salary range if specified")
+ requirements: List[JobRequirement] = Field(description="Categorized job requirements")
+ application_deadline: Optional[str] = Field(description="Application deadline if specified")
+ contact_info: Optional[dict] = Field(description="Contact information from footer or contact section")
+
+ # First try with markdown (default)
+ markdown_strategy = LLMExtractionStrategy(
+ provider="openai/gpt-4o",
+ api_token=os.getenv("OPENAI_API_KEY"),
+ schema=JobPosting.model_json_schema(),
+ extraction_type="schema",
+ instruction="""
+ Extract job posting details into structured data. Focus on the visible text content
+ and organize requirements into categories.
+ """,
+ input_format="markdown" # default
+ )
+
+ # Then with HTML for better structure understanding
+ html_strategy = LLMExtractionStrategy(
+ provider="openai/gpt-4",
+ api_token=os.getenv("OPENAI_API_KEY"),
+ schema=JobPosting.model_json_schema(),
+ extraction_type="schema",
+ instruction="""
+ Extract job posting details, using HTML structure to:
+ 1. Identify requirement priorities from CSS classes (e.g., 'required' vs 'preferred')
+ 2. Extract contact info from the page footer or dedicated contact section
+ 3. Parse salary information from specially formatted elements
+ 4. Determine application deadline from timestamp or date elements
+
+ Use HTML attributes and classes to enhance extraction accuracy.
+ """,
+ input_format="html" # explicitly use HTML
+ )
+
+ async with AsyncWebCrawler() as crawler:
+ # Try with markdown first
+ markdown_config = CrawlerRunConfig(
+ extraction_strategy=markdown_strategy
+ )
+ markdown_result = await crawler.arun(
+ url=url,
+ config=markdown_config
+ )
+ print("\nMarkdown-based Extraction Result:")
+ items = json.loads(markdown_result.extracted_content)
+ print(json.dumps(items, indent=2))
+
+ # Then with HTML for better structure understanding
+ html_config = CrawlerRunConfig(
+ extraction_strategy=html_strategy
+ )
+ html_result = await crawler.arun(
+ url=url,
+ config=html_config
+ )
+ print("\nHTML-based Extraction Result:")
+ items = json.loads(html_result.extracted_content)
+ print(json.dumps(items, indent=2))
+
+# Main execution
+async def main():
+ print("Crawl4AI v0.4.24 Feature Walkthrough")
+ print("====================================")
+
+ # Run all demos
+ await demo_ssl_features()
+ await demo_content_filtering()
+ await demo_json_extraction()
+ # await demo_input_formats()
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/docs/md_v2/advanced/content-processing.md b/docs/md_v2/advanced/content-processing.md
new file mode 100644
index 0000000000000000000000000000000000000000..25ed6172f2451dcce4898c1eeb0cb923ed07defd
--- /dev/null
+++ b/docs/md_v2/advanced/content-processing.md
@@ -0,0 +1,136 @@
+# Content Processing
+
+Crawl4AI provides powerful content processing capabilities that help you extract clean, relevant content from web pages. This guide covers content cleaning, media handling, link analysis, and metadata extraction.
+
+## Media Processing
+
+Crawl4AI provides comprehensive media extraction and analysis capabilities. It automatically detects and processes various types of media elements while maintaining their context and relevance.
+
+### Image Processing
+
+The library handles various image scenarios, including:
+- Regular images
+- Lazy-loaded images
+- Background images
+- Responsive images
+- Image metadata and context
+
+```python
+from crawl4ai.async_configs import CrawlerRunConfig
+
+config = CrawlerRunConfig()
+result = await crawler.arun(url="https://example.com", config=config)
+
+for image in result.media["images"]:
+ # Each image includes rich metadata
+ print(f"Source: {image['src']}")
+ print(f"Alt text: {image['alt']}")
+ print(f"Description: {image['desc']}")
+ print(f"Context: {image['context']}") # Surrounding text
+ print(f"Relevance score: {image['score']}") # 0-10 score
+```
+
+### Handling Lazy-Loaded Content
+
+Crawl4AI already handles lazy loading for media elements. You can customize the wait time for lazy-loaded content with `CrawlerRunConfig`:
+
+```python
+config = CrawlerRunConfig(
+ wait_for="css:img[data-src]", # Wait for lazy images
+ delay_before_return_html=2.0 # Additional wait time
+)
+result = await crawler.arun(url="https://example.com", config=config)
+```
+
+### Video and Audio Content
+
+The library extracts video and audio elements with their metadata:
+
+```python
+from crawl4ai.async_configs import CrawlerRunConfig
+
+config = CrawlerRunConfig()
+result = await crawler.arun(url="https://example.com", config=config)
+
+# Process videos
+for video in result.media["videos"]:
+ print(f"Video source: {video['src']}")
+ print(f"Type: {video['type']}")
+ print(f"Duration: {video.get('duration')}")
+ print(f"Thumbnail: {video.get('poster')}")
+
+# Process audio
+for audio in result.media["audios"]:
+ print(f"Audio source: {audio['src']}")
+ print(f"Type: {audio['type']}")
+ print(f"Duration: {audio.get('duration')}")
+```
+
+## Link Analysis
+
+Crawl4AI provides sophisticated link analysis capabilities, helping you understand the relationship between pages and identify important navigation patterns.
+
+### Link Classification
+
+The library automatically categorizes links into:
+- Internal links (same domain)
+- External links (different domains)
+- Social media links
+- Navigation links
+- Content links
+
+```python
+from crawl4ai.async_configs import CrawlerRunConfig
+
+config = CrawlerRunConfig()
+result = await crawler.arun(url="https://example.com", config=config)
+
+# Analyze internal links
+for link in result.links["internal"]:
+ print(f"Internal: {link['href']}")
+ print(f"Link text: {link['text']}")
+ print(f"Context: {link['context']}") # Surrounding text
+ print(f"Type: {link['type']}") # nav, content, etc.
+
+# Analyze external links
+for link in result.links["external"]:
+ print(f"External: {link['href']}")
+ print(f"Domain: {link['domain']}")
+ print(f"Type: {link['type']}")
+```
+
+### Smart Link Filtering
+
+Control which links are included in the results with `CrawlerRunConfig`:
+
+```python
+config = CrawlerRunConfig(
+ exclude_external_links=True, # Remove external links
+ exclude_social_media_links=True, # Remove social media links
+ exclude_social_media_domains=[ # Custom social media domains
+ "facebook.com", "twitter.com", "instagram.com"
+ ],
+ exclude_domains=["ads.example.com"] # Exclude specific domains
+)
+result = await crawler.arun(url="https://example.com", config=config)
+```
+
+## Metadata Extraction
+
+Crawl4AI automatically extracts and processes page metadata, providing valuable information about the content:
+
+```python
+from crawl4ai.async_configs import CrawlerRunConfig
+
+config = CrawlerRunConfig()
+result = await crawler.arun(url="https://example.com", config=config)
+
+metadata = result.metadata
+print(f"Title: {metadata['title']}")
+print(f"Description: {metadata['description']}")
+print(f"Keywords: {metadata['keywords']}")
+print(f"Author: {metadata['author']}")
+print(f"Published Date: {metadata['published_date']}")
+print(f"Modified Date: {metadata['modified_date']}")
+print(f"Language: {metadata['language']}")
+```
diff --git a/docs/md_v2/advanced/hooks-auth.md b/docs/md_v2/advanced/hooks-auth.md
new file mode 100644
index 0000000000000000000000000000000000000000..6604222910df5d77cfbf99deaca8c54cd299e9d3
--- /dev/null
+++ b/docs/md_v2/advanced/hooks-auth.md
@@ -0,0 +1,121 @@
+# Hooks & Auth for AsyncWebCrawler
+
+Crawl4AI's `AsyncWebCrawler` allows you to customize the behavior of the web crawler using hooks. Hooks are asynchronous functions called at specific points in the crawling process, allowing you to modify the crawler's behavior or perform additional actions. This updated documentation demonstrates how to use hooks, including the new `on_page_context_created` hook, and ensures compatibility with `BrowserConfig` and `CrawlerRunConfig`.
+
+## Example: Using Crawler Hooks with AsyncWebCrawler
+
+In this example, we'll:
+
+1. Configure the browser and set up authentication when it's created.
+2. Apply custom routing and initial actions when the page context is created.
+3. Add custom headers before navigating to the URL.
+4. Log the current URL after navigation.
+5. Perform actions after JavaScript execution.
+6. Log the length of the HTML before returning it.
+
+### Hook Definitions
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+from playwright.async_api import Page, Browser, BrowserContext
+
+def log_routing(route):
+ # Example: block loading images
+ if route.request.resource_type == "image":
+ print(f"[HOOK] Blocking image request: {route.request.url}")
+ asyncio.create_task(route.abort())
+ else:
+ asyncio.create_task(route.continue_())
+
+async def on_browser_created(browser: Browser, **kwargs):
+ print("[HOOK] on_browser_created")
+ # Example: Set browser viewport size and log in
+ context = await browser.new_context(viewport={"width": 1920, "height": 1080})
+ page = await context.new_page()
+ await page.goto("https://example.com/login")
+ await page.fill("input[name='username']", "testuser")
+ await page.fill("input[name='password']", "password123")
+ await page.click("button[type='submit']")
+ await page.wait_for_selector("#welcome")
+ await context.add_cookies([{"name": "auth_token", "value": "abc123", "url": "https://example.com"}])
+ await page.close()
+ await context.close()
+
+async def on_page_context_created(context: BrowserContext, page: Page, **kwargs):
+ print("[HOOK] on_page_context_created")
+ await context.route("**", log_routing)
+
+async def before_goto(page: Page, context: BrowserContext, **kwargs):
+ print("[HOOK] before_goto")
+ await page.set_extra_http_headers({"X-Test-Header": "test"})
+
+async def after_goto(page: Page, context: BrowserContext, **kwargs):
+ print("[HOOK] after_goto")
+ print(f"Current URL: {page.url}")
+
+async def on_execution_started(page: Page, context: BrowserContext, **kwargs):
+ print("[HOOK] on_execution_started")
+ await page.evaluate("console.log('Custom JS executed')")
+
+async def before_return_html(page: Page, context: BrowserContext, html: str, **kwargs):
+ print("[HOOK] before_return_html")
+ print(f"HTML length: {len(html)}")
+ return page
+```
+
+### Using the Hooks with AsyncWebCrawler
+
+```python
+async def main():
+ print("\n🔗 Using Crawler Hooks: Customize AsyncWebCrawler with hooks!")
+
+ # Configure browser and crawler settings
+ browser_config = BrowserConfig(
+ headless=True,
+ viewport_width=1920,
+ viewport_height=1080
+ )
+
+ crawler_run_config = CrawlerRunConfig(
+ js_code="window.scrollTo(0, document.body.scrollHeight);",
+ wait_for="footer"
+ )
+
+ # Initialize crawler
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created)
+ crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created)
+ crawler.crawler_strategy.set_hook("before_goto", before_goto)
+ crawler.crawler_strategy.set_hook("after_goto", after_goto)
+ crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
+ crawler.crawler_strategy.set_hook("before_return_html", before_return_html)
+
+ # Run the crawler
+ result = await crawler.arun(url="https://example.com", config=crawler_run_config)
+
+ print("\n📦 Crawler Hooks Result:")
+ print(result)
+
+asyncio.run(main())
+```
+
+### Explanation of Hooks
+
+- **`on_browser_created`**: Called when the browser is created. Use this to configure the browser or handle authentication (e.g., logging in and setting cookies).
+- **`on_page_context_created`**: Called when a new page context is created. Use this to apply routing, block resources, or inject custom logic before navigating to the URL.
+- **`before_goto`**: Called before navigating to the URL. Use this to add custom headers or perform other pre-navigation actions.
+- **`after_goto`**: Called after navigation. Use this to verify content or log the URL.
+- **`on_execution_started`**: Called after executing custom JavaScript. Use this to perform additional actions.
+- **`before_return_html`**: Called before returning the HTML content. Use this to log details or preprocess the content.
+
+### Additional Customizations
+
+- **Resource Management**: Use `on_page_context_created` to block or modify requests (e.g., block images, fonts, or third-party scripts); a short sketch follows this list.
+- **Dynamic Headers**: Use `before_goto` to add or modify headers dynamically based on the URL.
+- **Authentication**: Use `on_browser_created` to handle login processes and set authentication cookies or tokens.
+- **Content Analysis**: Use `before_return_html` to analyze or modify the extracted HTML content.
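+
+For example, a stricter version of the `log_routing` handler above could also block fonts and third-party scripts (the allowed domain is an illustrative placeholder):
+
+```python
+import asyncio
+from playwright.async_api import Page, BrowserContext
+
+ALLOWED_DOMAIN = "example.com"  # placeholder first-party domain
+
+def strict_routing(route):
+    request = route.request
+    third_party = ALLOWED_DOMAIN not in request.url
+    if request.resource_type in ("image", "font") or (
+        request.resource_type == "script" and third_party
+    ):
+        print(f"[HOOK] Blocking {request.resource_type}: {request.url}")
+        asyncio.create_task(route.abort())
+    else:
+        asyncio.create_task(route.continue_())
+
+async def on_page_context_created(context: BrowserContext, page: Page, **kwargs):
+    print("[HOOK] on_page_context_created")
+    await context.route("**", strict_routing)
+```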
+
+These hooks provide powerful customization options for tailoring the crawling process to your needs.
+
diff --git a/docs/md_v2/advanced/identity_based_crawling.md b/docs/md_v2/advanced/identity_based_crawling.md
new file mode 100644
index 0000000000000000000000000000000000000000..c0ab7fd599d13c62c9c4c4a64ca6d5499966dde4
--- /dev/null
+++ b/docs/md_v2/advanced/identity_based_crawling.md
@@ -0,0 +1,156 @@
+### Preserve Your Identity with Crawl4AI
+
+Crawl4AI empowers you to navigate and interact with the web using your authentic digital identity, ensuring that you are recognized as a human and not mistaken for a bot. This document introduces Managed Browsers, the recommended approach for preserving your rights to access the web, and Magic Mode, a simplified solution for specific scenarios.
+
+---
+
+### Managed Browsers: Your Digital Identity Solution
+
+**Managed Browsers** enable developers to create and use persistent browser profiles. These profiles store local storage, cookies, and other session-related data, allowing you to interact with websites as a recognized user. By leveraging your unique identity, Managed Browsers ensure that your experience reflects your rights as a human browsing the web.
+
+#### Why Use Managed Browsers?
+1. **Authentic Browsing Experience**: Managed Browsers retain session data and browser fingerprints, mirroring genuine user behavior.
+2. **Effortless Configuration**: Once you interact with the site using the browser (e.g., solving a CAPTCHA), the session data is saved and reused, providing seamless access.
+3. **Empowered Data Access**: By using your identity, Managed Browsers empower users to access data they can view on their own screens without artificial restrictions.
+
+#### Steps to Use Managed Browsers
+
+1. **Setup the Browser Configuration**:
+ ```python
+ from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+ from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+
+ browser_config = BrowserConfig(
+ headless=False, # Set to False for initial setup to view browser actions
+ verbose=True,
+ user_agent_mode="random",
+ use_managed_browser=True, # Enables persistent browser sessions
+ browser_type="chromium",
+ user_data_dir="/path/to/user_profile_data" # Path to save session data
+ )
+ ```
+
+2. **Perform an Initial Run**:
+ - Run the crawler with `headless=False`.
+ - Manually interact with the site (e.g., solve CAPTCHA or log in).
+   - The browser session saves cookies, local storage, and other required data (a short sketch of this one-time run follows these steps).
+
+3. **Subsequent Runs**:
+ - Switch to `headless=True` for automation.
+ - The session data is reused, allowing seamless crawling.
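+
+A minimal sketch of the one-time interactive run from step 2 (the profile path is a placeholder):
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig
+
+async def initial_setup_run():
+    browser_config = BrowserConfig(
+        headless=False,            # visible browser so you can log in or solve a CAPTCHA
+        verbose=True,
+        use_managed_browser=True,
+        browser_type="chromium",
+        user_data_dir="/path/to/user_profile_data"
+    )
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        # Load the site once; interact in the visible window as needed.
+        # The profile directory keeps cookies and local storage for later runs.
+        await crawler.arun(url="https://example.com")
+
+if __name__ == "__main__":
+    asyncio.run(initial_setup_run())
+```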
+
+#### Example: Extracting Data Using Managed Browsers
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+
+async def main():
+ # Define schema for structured data extraction
+ schema = {
+ "name": "Example Data",
+ "baseSelector": "div.example",
+ "fields": [
+ {"name": "title", "selector": "h1", "type": "text"},
+ {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"}
+ ]
+ }
+
+ # Configure crawler
+ browser_config = BrowserConfig(
+ headless=True, # Automate subsequent runs
+ verbose=True,
+ use_managed_browser=True,
+ user_data_dir="/path/to/user_profile_data"
+ )
+
+ crawl_config = CrawlerRunConfig(
+ extraction_strategy=JsonCssExtractionStrategy(schema),
+ wait_for="css:div.example" # Wait for the targeted element to load
+ )
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(
+ url="https://example.com",
+ config=crawl_config
+ )
+
+ if result.success:
+ print("Extracted Data:", result.extracted_content)
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+### Benefits of Managed Browsers Over Other Methods
+Managed Browsers eliminate the need for manual detection workarounds by enabling developers to work directly with their identity and user profile data. This approach ensures maximum compatibility with websites and simplifies the crawling process while preserving your right to access data freely.
+
+---
+
+### Magic Mode: Simplified Automation
+
+While Managed Browsers are the preferred approach, **Magic Mode** provides an alternative for scenarios where persistent user profiles are unnecessary or infeasible. Magic Mode automates user-like behavior and simplifies configuration.
+
+#### What Magic Mode Does:
+- Simulates human browsing by randomizing interaction patterns and timing.
+- Masks browser automation signals.
+- Handles cookie popups and modals.
+- Modifies navigator properties for enhanced compatibility.
+
+#### Using Magic Mode
+
+```python
+async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url="https://example.com",
+ magic=True # Enables all automation features
+ )
+```
+
+Magic Mode is particularly useful for:
+- Quick prototyping when a Managed Browser setup is not available.
+- Basic sites requiring minimal interaction or configuration.
+
+#### Example: Combining Magic Mode with Additional Options
+
+```python
+async def crawl_with_magic_mode(url: str):
+ async with AsyncWebCrawler(headless=True) as crawler:
+ result = await crawler.arun(
+ url=url,
+ magic=True,
+ remove_overlay_elements=True, # Remove popups/modals
+ page_timeout=60000 # Increased timeout for complex pages
+ )
+
+ return result.markdown if result.success else None
+```
+
+### Magic Mode vs. Managed Browsers
+While Magic Mode simplifies many tasks, it cannot match the reliability and authenticity of Managed Browsers. By using your identity and persistent profiles, Managed Browsers render Magic Mode largely unnecessary. However, Magic Mode remains a viable fallback for specific situations where user identity is not a factor.
+
+---
+
+### Key Comparison: Managed Browsers vs. Magic Mode
+
+| Feature | **Managed Browsers** | **Magic Mode** |
+|-------------------------|------------------------------------------|-------------------------------------|
+| **Session Persistence** | Retains cookies and local storage. | No session retention. |
+| **Human Interaction** | Uses real user profiles and data. | Simulates human-like patterns. |
+| **Complex Sites**       | Best suited for heavily configured sites. | Works well with simpler challenges. |
+| **Setup Complexity** | Requires initial manual interaction. | Fully automated, one-line setup. |
+
+#### Recommendation:
+- Use **Managed Browsers** for reliable, session-based crawling and data extraction.
+- Use **Magic Mode** for quick prototyping or when persistent profiles are not required.
+
+---
+
+### Conclusion
+
+- **Use Managed Browsers** to preserve your digital identity and ensure reliable, identity-based crawling with persistent sessions. This approach works seamlessly for even the most complex websites.
+- **Leverage Magic Mode** for quick automation or in scenarios where persistent user profiles are not needed.
+
+By combining these approaches, Crawl4AI provides unparalleled flexibility and capability for your crawling needs.
+
diff --git a/docs/md_v2/advanced/magic-mode.md b/docs/md_v2/advanced/magic-mode.md
new file mode 100644
index 0000000000000000000000000000000000000000..16c7229e787deb263af917067fefbc9d89142f5c
--- /dev/null
+++ b/docs/md_v2/advanced/magic-mode.md
@@ -0,0 +1,52 @@
+# Magic Mode & Anti-Bot Protection
+
+Crawl4AI provides powerful anti-detection capabilities, with Magic Mode being the simplest and most comprehensive solution.
+
+## Magic Mode
+
+The easiest way to bypass anti-bot protections:
+
+```python
+async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url="https://example.com",
+ magic=True # Enables all anti-detection features
+ )
+```
+
+Magic Mode automatically:
+- Masks browser automation signals
+- Simulates human-like behavior
+- Overrides navigator properties
+- Handles cookie consent popups
+- Manages browser fingerprinting
+- Randomizes timing patterns
+
+## Manual Anti-Bot Options
+
+While Magic Mode is recommended, you can also configure individual anti-detection features:
+
+```python
+result = await crawler.arun(
+ url="https://example.com",
+ simulate_user=True, # Simulate human behavior
+ override_navigator=True # Mask automation signals
+)
+```
+
+Note: When `magic=True` is used, you don't need to set these individual options.
+
+## Example: Handling Protected Sites
+
+```python
+async def crawl_protected_site(url: str):
+ async with AsyncWebCrawler(headless=True) as crawler:
+ result = await crawler.arun(
+ url=url,
+ magic=True,
+ remove_overlay_elements=True, # Remove popups/modals
+ page_timeout=60000 # Increased timeout for protection checks
+ )
+
+ return result.markdown if result.success else None
+```
diff --git a/docs/md_v2/advanced/managed_browser.md b/docs/md_v2/advanced/managed_browser.md
new file mode 100644
index 0000000000000000000000000000000000000000..bbe07f2f8deff46e1722a3b002ded08b046a9a6c
--- /dev/null
+++ b/docs/md_v2/advanced/managed_browser.md
@@ -0,0 +1,188 @@
+# Creating Browser Instances, Contexts, and Pages
+
+## 1 Introduction
+
+### Overview of Browser Management in Crawl4AI
+Crawl4AI's browser management system is designed to provide developers with advanced tools for handling complex web crawling tasks. By managing browser instances, contexts, and pages, Crawl4AI ensures optimal performance, anti-bot measures, and session persistence for high-volume, dynamic web crawling.
+
+### Key Objectives
+- **Anti-Bot Handling**:
+ - Implements stealth techniques to evade detection mechanisms used by modern websites.
+ - Simulates human-like behavior, such as mouse movements, scrolling, and key presses.
+ - Supports integration with third-party services to bypass CAPTCHA challenges.
+- **Persistent Sessions**:
+ - Retains session data (cookies, local storage) for workflows requiring user authentication.
+ - Allows seamless continuation of tasks across multiple runs without re-authentication.
+- **Scalable Crawling**:
+ - Optimized resource utilization for handling thousands of URLs concurrently.
+ - Flexible configuration options to tailor crawling behavior to specific requirements.
+
+---
+
+## 2 Browser Creation Methods
+
+### Standard Browser Creation
+Standard browser creation initializes a browser instance with default or minimal configurations. It is suitable for tasks that do not require session persistence or heavy customization.
+
+#### Features and Limitations
+- **Features**:
+ - Quick and straightforward setup for small-scale tasks.
+ - Supports headless and headful modes.
+- **Limitations**:
+ - Lacks advanced customization options like session reuse.
+ - May struggle with sites employing strict anti-bot measures.
+
+#### Example Usage
+```python
+from crawl4ai import AsyncWebCrawler, BrowserConfig
+
+browser_config = BrowserConfig(browser_type="chromium", headless=True)
+async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun("https://crawl4ai.com")
+ print(result.markdown)
+```
+
+### Persistent Contexts
+Persistent contexts create browser sessions with stored data, enabling workflows that require maintaining login states or other session-specific information.
+
+#### Benefits of Using `user_data_dir`
+- **Session Persistence**:
+ - Stores cookies, local storage, and cache between crawling sessions.
+ - Reduces overhead for repetitive logins or multi-step workflows.
+- **Enhanced Performance**:
+ - Leverages pre-loaded resources for faster page loading.
+- **Flexibility**:
+ - Adapts to complex workflows requiring user-specific configurations.
+
+#### Example: Setting Up Persistent Contexts
+```python
+config = BrowserConfig(user_data_dir="/path/to/user/data")
+async with AsyncWebCrawler(config=config) as crawler:
+ result = await crawler.arun("https://crawl4ai.com")
+ print(result.markdown)
+```
+
+### Managed Browser
+The `ManagedBrowser` class offers a high-level abstraction for managing browser instances, emphasizing resource management, debugging capabilities, and anti-bot measures.
+
+#### How It Works
+- **Browser Process Management**:
+ - Automates initialization and cleanup of browser processes.
+ - Optimizes resource usage by pooling and reusing browser instances.
+- **Debugging Support**:
+ - Integrates with debugging tools like Chrome Developer Tools for real-time inspection.
+- **Anti-Bot Measures**:
+ - Implements stealth plugins to mimic real user behavior and bypass bot detection.
+
+#### Features
+- **Customizable Configurations**:
+ - Supports advanced options such as viewport resizing, proxy settings, and header manipulation.
+- **Debugging and Logging**:
+ - Logs detailed browser interactions for debugging and performance analysis.
+- **Scalability**:
+ - Handles multiple browser instances concurrently, scaling dynamically based on workload.
+
+#### Example: Using `ManagedBrowser`
+```python
+from crawl4ai import AsyncWebCrawler, BrowserConfig
+
+config = BrowserConfig(headless=False, debug_port=9222)
+async with AsyncWebCrawler(config=config) as crawler:
+ result = await crawler.arun("https://crawl4ai.com")
+ print(result.markdown)
+```
+
+---
+
+## 3 Context and Page Management
+
+### Creating and Configuring Browser Contexts
+Browser contexts act as isolated environments within a single browser instance, enabling independent browsing sessions with their own cookies, cache, and storage.
+
+#### Customizations
+- **Headers and Cookies**:
+ - Define custom headers to mimic specific devices or browsers.
+ - Set cookies for authenticated sessions.
+- **Session Reuse**:
+ - Retain and reuse session data across multiple requests.
+ - Example: Preserve login states for authenticated crawls.
+
+#### Example: Context Initialization
+```python
+from crawl4ai import CrawlerRunConfig
+
+config = CrawlerRunConfig(headers={"User-Agent": "Crawl4AI/1.0"})
+async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun("https://crawl4ai.com", config=config)
+ print(result.markdown)
+```
+
+### Creating Pages
+Pages represent individual tabs or views within a browser context. They are responsible for rendering content, executing JavaScript, and handling user interactions.
+
+#### Key Features
+- **IFrame Handling**:
+ - Extract content from embedded iframes.
+ - Navigate and interact with nested content.
+- **Viewport Customization**:
+ - Adjust viewport size to match target device dimensions.
+- **Lazy Loading**:
+ - Ensure dynamic elements are fully loaded before extraction.
+
+#### Example: Page Initialization
+```python
+config = CrawlerRunConfig(viewport_width=1920, viewport_height=1080)
+async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun("https://crawl4ai.com", config=config)
+ print(result.markdown)
+```
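+
+As a follow-up to the features listed above, the sketch below combines iframe processing and a lazy-loading wait in one run configuration. The parameter names (`process_iframes`, `wait_for`) come from the `arun()` parameter guide; passing them through `CrawlerRunConfig` is an assumption here, and the selector and viewport sizes are placeholders.
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.async_configs import CrawlerRunConfig
+
+config = CrawlerRunConfig(
+    process_iframes=True,          # Pull content out of embedded iframes
+    wait_for="css:.lazy-content",  # Wait until lazy-loaded elements appear
+    viewport_width=1280,           # Placeholder device dimensions
+    viewport_height=720,
+)
+
+async def crawl_lazy_page():
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun("https://crawl4ai.com", config=config)
+        print(result.markdown)
+
+asyncio.run(crawl_lazy_page())
+```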
+
+---
+
+## 4 Advanced Features and Best Practices
+
+### Debugging and Logging
+Remote debugging provides a powerful way to troubleshoot complex crawling workflows.
+
+#### Example: Enabling Remote Debugging
+```python
+config = BrowserConfig(debug_port=9222)
+async with AsyncWebCrawler(config=config) as crawler:
+ result = await crawler.arun("https://crawl4ai.com")
+```
+
+### Anti-Bot Techniques
+- **Human Behavior Simulation**:
+ - Mimic real user actions, such as scrolling, clicking, and typing.
+ - Example: Use JavaScript to simulate interactions.
+- **Captcha Handling**:
+ - Integrate with third-party services like 2Captcha or AntiCaptcha for automated solving.
+
+#### Example: Simulating User Actions
+```python
+js_code = """
+(async () => {
+ document.querySelector('input[name="search"]').value = 'test';
+ document.querySelector('button[type="submit"]').click();
+})();
+"""
+config = CrawlerRunConfig(js_code=[js_code])
+async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun("https://crawl4ai.com", config=config)
+```
+
+### Optimizations for Performance and Scalability
+- **Persistent Contexts**:
+ - Reuse browser contexts to minimize resource consumption.
+- **Concurrent Crawls**:
+ - Use `arun_many` with a controlled semaphore count for efficient batch processing.
+
+#### Example: Scaling Crawls
+```python
+urls = ["https://example1.com", "https://example2.com"]
+config = CrawlerRunConfig(semaphore_count=10)
+async with AsyncWebCrawler() as crawler:
+ results = await crawler.arun_many(urls, config=config)
+ for result in results:
+ print(result.url, result.markdown)
+```
diff --git a/docs/md_v2/advanced/proxy-security.md b/docs/md_v2/advanced/proxy-security.md
new file mode 100644
index 0000000000000000000000000000000000000000..8989777b0b07e40a8c3c744dd53dd89919c57b6b
--- /dev/null
+++ b/docs/md_v2/advanced/proxy-security.md
@@ -0,0 +1,95 @@
+# Proxy & Security
+
+Configure proxy settings and enhance security features in Crawl4AI for reliable data extraction.
+
+## Basic Proxy Setup
+
+Simple proxy configuration with `BrowserConfig`:
+
+```python
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.async_configs import BrowserConfig
+
+# Using proxy URL
+browser_config = BrowserConfig(proxy="http://proxy.example.com:8080")
+async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(url="https://example.com")
+
+# Using SOCKS proxy
+browser_config = BrowserConfig(proxy="socks5://proxy.example.com:1080")
+async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(url="https://example.com")
+```
+
+## Authenticated Proxy
+
+Use an authenticated proxy with `BrowserConfig`:
+
+```python
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.async_configs import BrowserConfig
+
+proxy_config = {
+ "server": "http://proxy.example.com:8080",
+ "username": "user",
+ "password": "pass"
+}
+
+browser_config = BrowserConfig(proxy_config=proxy_config)
+async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(url="https://example.com")
+```
+
+## Rotating Proxies
+
+Example using a proxy rotation service, creating a fresh `BrowserConfig` (and crawler) for each proxy:
+
+```python
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.async_configs import BrowserConfig
+
+async def get_next_proxy():
+ # Your proxy rotation logic here
+ return {"server": "http://next.proxy.com:8080"}
+
+async def crawl_with_rotation(urls):
+    # Proxy settings are part of BrowserConfig, so rotate by creating a fresh
+    # browser (and crawler) for each proxy.
+    for url in urls:
+        proxy = await get_next_proxy()
+        browser_config = BrowserConfig(proxy_config=proxy)
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            result = await crawler.arun(url=url)
+```
+
+## Custom Headers
+
+Add security-related headers via `BrowserConfig`:
+
+```python
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.async_configs import BrowserConfig
+
+headers = {
+ "X-Forwarded-For": "203.0.113.195",
+ "Accept-Language": "en-US,en;q=0.9",
+ "Cache-Control": "no-cache",
+ "Pragma": "no-cache"
+}
+
+browser_config = BrowserConfig(headers=headers)
+async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(url="https://example.com")
+```
+
+## Combining with Magic Mode
+
+For maximum protection, combine proxy with Magic Mode via `CrawlerRunConfig` and `BrowserConfig`:
+
+```python
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+
+browser_config = BrowserConfig(
+ proxy="http://proxy.example.com:8080",
+ headers={"Accept-Language": "en-US"}
+)
+crawler_config = CrawlerRunConfig(magic=True) # Enable all anti-detection features
+
+async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(url="https://example.com", config=crawler_config)
+```
diff --git a/docs/md_v2/advanced/session-management-advanced.md b/docs/md_v2/advanced/session-management-advanced.md
new file mode 100644
index 0000000000000000000000000000000000000000..ba1ae0a0bfc0ef52f82eb6e915496b31c4b3b1b5
--- /dev/null
+++ b/docs/md_v2/advanced/session-management-advanced.md
@@ -0,0 +1,179 @@
+### Session-Based Crawling for Dynamic Content
+
+In modern web applications, content is often loaded dynamically without changing the URL. Examples include "Load More" buttons, infinite scrolling, or paginated content that updates via JavaScript. Crawl4AI provides session-based crawling capabilities to handle such scenarios effectively.
+
+This guide explores advanced techniques for crawling dynamic content using Crawl4AI's session management features.
+
+---
+
+## Understanding Session-Based Crawling
+
+Session-based crawling allows you to reuse a persistent browser session across multiple actions. This means the same browser tab (or page object) is used throughout, enabling:
+
+1. **Efficient handling of dynamic content** without reloading the page.
+2. **JavaScript actions before and after crawling** (e.g., clicking buttons or scrolling).
+3. **State maintenance** for authenticated sessions or multi-step workflows.
+4. **Faster sequential crawling**, as it avoids reopening tabs or reallocating resources.
+
+**Note:** Session-based crawling is ideal for sequential operations, not parallel tasks.
+
+---
+
+## Basic Concepts
+
+Before diving into examples, here are some key concepts:
+
+- **Session ID**: A unique identifier for a browsing session. Use the same `session_id` across multiple requests to maintain state.
+- **BrowserConfig & CrawlerRunConfig**: These configuration objects control browser settings and crawling behavior.
+- **JavaScript Execution**: Use `js_code` to perform actions like clicking buttons.
+- **CSS Selectors**: Target specific elements for interaction or data extraction.
+- **Extraction Strategy**: Define rules to extract structured data.
+- **Wait Conditions**: Specify conditions to wait for before proceeding.
+
+---
+
+## Example 1: Basic Session-Based Crawling
+
+A simple example using session-based crawling:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+from crawl4ai.cache_context import CacheMode
+
+async def basic_session_crawl():
+ async with AsyncWebCrawler() as crawler:
+ session_id = "dynamic_content_session"
+ url = "https://example.com/dynamic-content"
+
+ for page in range(3):
+ config = CrawlerRunConfig(
+ url=url,
+ session_id=session_id,
+ js_code="document.querySelector('.load-more-button').click();" if page > 0 else None,
+ css_selector=".content-item",
+ cache_mode=CacheMode.BYPASS
+ )
+
+ result = await crawler.arun(config=config)
+            # Rough progress check: no extraction strategy is set, so count
+            # occurrences of the item class in the cleaned HTML instead.
+            item_count = result.cleaned_html.count("content-item") if result.cleaned_html else 0
+            print(f"Page {page + 1}: Found {item_count} items")
+
+ await crawler.crawler_strategy.kill_session(session_id)
+
+asyncio.run(basic_session_crawl())
+```
+
+This example shows:
+1. Reusing the same `session_id` across multiple requests.
+2. Executing JavaScript to load more content dynamically.
+3. Properly closing the session to free resources.
+
+---
+
+## Advanced Technique 1: Custom Execution Hooks
+
+Use custom hooks to handle complex scenarios, such as waiting for content to load dynamically:
+
+```python
+async def advanced_session_crawl_with_hooks():
+ first_commit = ""
+
+ async def on_execution_started(page):
+ nonlocal first_commit
+ try:
+ while True:
+ await page.wait_for_selector("li.commit-item h4")
+ commit = await page.query_selector("li.commit-item h4")
+                commit = (await commit.evaluate("(element) => element.textContent")).strip()
+ if commit and commit != first_commit:
+ first_commit = commit
+ break
+ await asyncio.sleep(0.5)
+ except Exception as e:
+ print(f"Warning: New content didn't appear: {e}")
+
+ async with AsyncWebCrawler() as crawler:
+ session_id = "commit_session"
+ url = "https://github.com/example/repo/commits/main"
+ crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
+
+ js_next_page = """document.querySelector('a.pagination-next').click();"""
+
+ for page in range(3):
+ config = CrawlerRunConfig(
+ url=url,
+ session_id=session_id,
+ js_code=js_next_page if page > 0 else None,
+ css_selector="li.commit-item",
+ js_only=page > 0,
+ cache_mode=CacheMode.BYPASS
+ )
+
+ result = await crawler.arun(config=config)
+            # Rough count: no extraction strategy is set, so count the commit
+            # item class in the cleaned HTML instead.
+            commit_count = result.cleaned_html.count("commit-item") if result.cleaned_html else 0
+            print(f"Page {page + 1}: Found {commit_count} commits")
+
+ await crawler.crawler_strategy.kill_session(session_id)
+
+asyncio.run(advanced_session_crawl_with_hooks())
+```
+
+This technique ensures new content loads before the next action.
+
+---
+
+## Advanced Technique 2: Integrated JavaScript Execution and Waiting
+
+Combine JavaScript execution and waiting logic for concise handling of dynamic content:
+
+```python
+async def integrated_js_and_wait_crawl():
+ async with AsyncWebCrawler() as crawler:
+ session_id = "integrated_session"
+ url = "https://github.com/example/repo/commits/main"
+
+ js_next_page_and_wait = """
+ (async () => {
+ const getCurrentCommit = () => document.querySelector('li.commit-item h4').textContent.trim();
+ const initialCommit = getCurrentCommit();
+ document.querySelector('a.pagination-next').click();
+ while (getCurrentCommit() === initialCommit) {
+ await new Promise(resolve => setTimeout(resolve, 100));
+ }
+ })();
+ """
+
+ for page in range(3):
+ config = CrawlerRunConfig(
+ url=url,
+ session_id=session_id,
+ js_code=js_next_page_and_wait if page > 0 else None,
+ css_selector="li.commit-item",
+ js_only=page > 0,
+ cache_mode=CacheMode.BYPASS
+ )
+
+ result = await crawler.arun(config=config)
+            # Rough count: no extraction strategy is set, so count the commit
+            # item class in the cleaned HTML instead.
+            commit_count = result.cleaned_html.count("commit-item") if result.cleaned_html else 0
+            print(f"Page {page + 1}: Found {commit_count} commits")
+
+ await crawler.crawler_strategy.kill_session(session_id)
+
+asyncio.run(integrated_js_and_wait_crawl())
+```
+
+---
+
+## Best Practices for Session-Based Crawling
+
+1. **Unique Session IDs**: Assign descriptive and unique `session_id` values.
+2. **Close Sessions**: Always clean up sessions with `kill_session` after use.
+3. **Error Handling**: Anticipate and handle errors gracefully.
+4. **Respect Websites**: Follow terms of service and robots.txt.
+5. **Delays**: Add delays to avoid overwhelming servers.
+6. **Optimize JavaScript**: Keep scripts concise for better performance.
+7. **Monitor Resources**: Track memory and CPU usage for long sessions.
+
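+The fragment below sketches practices 2 and 5 together: cleanup in a `finally` block plus a small delay between requests. It assumes the `kill_session` API used throughout this guide; the one-second delay is an arbitrary placeholder.
+
+```python
+import asyncio
+
+async def polite_session_crawl(crawler, configs, session_id):
+    # `configs` is a list of pre-built CrawlerRunConfig objects sharing one session_id.
+    try:
+        for config in configs:
+            await crawler.arun(config=config)
+            await asyncio.sleep(1.0)  # Gentle pacing to avoid overwhelming the server
+    finally:
+        # Always release the session, even if a crawl raises.
+        await crawler.crawler_strategy.kill_session(session_id)
+```
+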
+---
+
+## Conclusion
+
+Session-based crawling in Crawl4AI is a robust solution for handling dynamic content and multi-step workflows. By combining session management, JavaScript execution, and structured extraction strategies, you can effectively navigate and extract data from modern web applications. Always adhere to ethical web scraping practices and respect website policies.
\ No newline at end of file
diff --git a/docs/md_v2/advanced/session-management.md b/docs/md_v2/advanced/session-management.md
new file mode 100644
index 0000000000000000000000000000000000000000..e93482236f9191c5b1df58adeafc8812095c0afb
--- /dev/null
+++ b/docs/md_v2/advanced/session-management.md
@@ -0,0 +1,137 @@
+### Session Management
+
+Session management in Crawl4AI is a powerful feature that allows you to maintain state across multiple requests, making it particularly suitable for handling complex multi-step crawling tasks. It enables you to reuse the same browser tab (or page object) across sequential actions and crawls, which is beneficial for:
+
+- **Performing JavaScript actions before and after crawling.**
+- **Executing multiple sequential crawls faster** without needing to reopen tabs or allocate memory repeatedly.
+
+**Note:** This feature is designed for sequential workflows and is not suitable for parallel operations.
+
+---
+
+#### Basic Session Usage
+
+Use `BrowserConfig` and `CrawlerRunConfig` to maintain state with a `session_id`:
+
+```python
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+
+async with AsyncWebCrawler() as crawler:
+ session_id = "my_session"
+
+ # Define configurations
+ config1 = CrawlerRunConfig(url="https://example.com/page1", session_id=session_id)
+ config2 = CrawlerRunConfig(url="https://example.com/page2", session_id=session_id)
+
+ # First request
+ result1 = await crawler.arun(config=config1)
+
+ # Subsequent request using the same session
+ result2 = await crawler.arun(config=config2)
+
+ # Clean up when done
+ await crawler.crawler_strategy.kill_session(session_id)
+```
+
+---
+
+#### Dynamic Content with Sessions
+
+Here's an example of crawling GitHub commits across multiple pages while preserving session state:
+
+```python
+import json
+
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.async_configs import CrawlerRunConfig
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+from crawl4ai.cache_context import CacheMode
+
+async def crawl_dynamic_content():
+ async with AsyncWebCrawler() as crawler:
+ session_id = "github_commits_session"
+ url = "https://github.com/microsoft/TypeScript/commits/main"
+ all_commits = []
+
+ # Define extraction schema
+ schema = {
+ "name": "Commit Extractor",
+ "baseSelector": "li.Box-sc-g0xbh4-0",
+ "fields": [{"name": "title", "selector": "h4.markdown-title", "type": "text"}],
+ }
+ extraction_strategy = JsonCssExtractionStrategy(schema)
+
+ # JavaScript and wait configurations
+ js_next_page = """document.querySelector('a[data-testid="pagination-next-button"]').click();"""
+ wait_for = """() => document.querySelectorAll('li.Box-sc-g0xbh4-0').length > 0"""
+
+ # Crawl multiple pages
+ for page in range(3):
+ config = CrawlerRunConfig(
+ url=url,
+ session_id=session_id,
+ extraction_strategy=extraction_strategy,
+ js_code=js_next_page if page > 0 else None,
+ wait_for=wait_for if page > 0 else None,
+ js_only=page > 0,
+ cache_mode=CacheMode.BYPASS
+ )
+
+ result = await crawler.arun(config=config)
+ if result.success:
+ commits = json.loads(result.extracted_content)
+ all_commits.extend(commits)
+ print(f"Page {page + 1}: Found {len(commits)} commits")
+
+ # Clean up session
+ await crawler.crawler_strategy.kill_session(session_id)
+ return all_commits
+```
+
+---
+
+#### Session Best Practices
+
+1. **Descriptive Session IDs**:
+ Use meaningful names for session IDs to organize workflows:
+ ```python
+ session_id = "login_flow_session"
+ session_id = "product_catalog_session"
+ ```
+
+2. **Resource Management**:
+ Always ensure sessions are cleaned up to free resources:
+ ```python
+ try:
+ # Your crawling code here
+ pass
+ finally:
+ await crawler.crawler_strategy.kill_session(session_id)
+ ```
+
+3. **State Maintenance**:
+ Reuse the session for subsequent actions within the same workflow:
+ ```python
+ # Step 1: Login
+ login_config = CrawlerRunConfig(
+ url="https://example.com/login",
+ session_id=session_id,
+ js_code="document.querySelector('form').submit();"
+ )
+ await crawler.arun(config=login_config)
+
+ # Step 2: Verify login success
+ dashboard_config = CrawlerRunConfig(
+ url="https://example.com/dashboard",
+ session_id=session_id,
+ wait_for="css:.user-profile" # Wait for authenticated content
+ )
+ result = await crawler.arun(config=dashboard_config)
+ ```
+
+---
+
+#### Common Use Cases for Sessions
+
+1. **Authentication Flows**: Login and interact with secured pages.
+2. **Pagination Handling**: Navigate through multiple pages.
+3. **Form Submissions**: Fill forms, submit, and process results.
+4. **Multi-step Processes**: Complete workflows that span multiple actions.
+5. **Dynamic Content Navigation**: Handle JavaScript-rendered or event-triggered content.
diff --git a/docs/md_v2/api/arun.md b/docs/md_v2/api/arun.md
new file mode 100644
index 0000000000000000000000000000000000000000..509991e583ecb8b4c0e85255de93caa319b56865
--- /dev/null
+++ b/docs/md_v2/api/arun.md
@@ -0,0 +1,244 @@
+# Complete Parameter Guide for arun()
+
+The following parameters can be passed to the `arun()` method. They are organized by their primary usage context and functionality.
+
+## Core Parameters
+
+```python
+await crawler.arun(
+ url="https://example.com", # Required: URL to crawl
+ verbose=True, # Enable detailed logging
+ cache_mode=CacheMode.ENABLED, # Control cache behavior
+ warmup=True # Whether to run warmup check
+)
+```
+
+## Cache Control
+
+```python
+from crawl4ai import CacheMode
+
+await crawler.arun(
+ cache_mode=CacheMode.ENABLED, # Normal caching (read/write)
+ # Other cache modes:
+ # cache_mode=CacheMode.DISABLED # No caching at all
+ # cache_mode=CacheMode.READ_ONLY # Only read from cache
+ # cache_mode=CacheMode.WRITE_ONLY # Only write to cache
+ # cache_mode=CacheMode.BYPASS # Skip cache for this operation
+)
+```
+
+## Content Processing Parameters
+
+### Text Processing
+```python
+await crawler.arun(
+ word_count_threshold=10, # Minimum words per content block
+ image_description_min_word_threshold=5, # Minimum words for image descriptions
+ only_text=False, # Extract only text content
+ excluded_tags=['form', 'nav'], # HTML tags to exclude
+ keep_data_attributes=False, # Preserve data-* attributes
+)
+```
+
+### Content Selection
+```python
+await crawler.arun(
+ css_selector=".main-content", # CSS selector for content extraction
+ remove_forms=True, # Remove all form elements
+ remove_overlay_elements=True, # Remove popups/modals/overlays
+)
+```
+
+### Link Handling
+```python
+await crawler.arun(
+ exclude_external_links=True, # Remove external links
+ exclude_social_media_links=True, # Remove social media links
+ exclude_external_images=True, # Remove external images
+ exclude_domains=["ads.example.com"], # Specific domains to exclude
+ social_media_domains=[ # Additional social media domains
+ "facebook.com",
+ "twitter.com",
+ "instagram.com"
+ ]
+)
+```
+
+## Browser Control Parameters
+
+### Basic Browser Settings
+```python
+await crawler.arun(
+ headless=True, # Run browser in headless mode
+ browser_type="chromium", # Browser engine: "chromium", "firefox", "webkit"
+ page_timeout=60000, # Page load timeout in milliseconds
+ user_agent="custom-agent", # Custom user agent
+)
+```
+
+### Navigation and Waiting
+```python
+await crawler.arun(
+ wait_for="css:.dynamic-content", # Wait for element/condition
+ delay_before_return_html=2.0, # Wait before returning HTML (seconds)
+)
+```
+
+### JavaScript Execution
+```python
+await crawler.arun(
+ js_code=[ # JavaScript to execute (string or list)
+ "window.scrollTo(0, document.body.scrollHeight);",
+ "document.querySelector('.load-more').click();"
+ ],
+ js_only=False, # Only execute JavaScript without reloading page
+)
+```
+
+### Anti-Bot Features
+```python
+await crawler.arun(
+ magic=True, # Enable all anti-detection features
+ simulate_user=True, # Simulate human behavior
+ override_navigator=True # Override navigator properties
+)
+```
+
+### Session Management
+```python
+await crawler.arun(
+ session_id="my_session", # Session identifier for persistent browsing
+)
+```
+
+### Screenshot Options
+```python
+await crawler.arun(
+ screenshot=True, # Take page screenshot
+ screenshot_wait_for=2.0, # Wait before screenshot (seconds)
+)
+```
+
+### Proxy Configuration
+```python
+await crawler.arun(
+ proxy="http://proxy.example.com:8080", # Simple proxy URL
+ proxy_config={ # Advanced proxy settings
+ "server": "http://proxy.example.com:8080",
+ "username": "user",
+ "password": "pass"
+ }
+)
+```
+
+## Content Extraction Parameters
+
+### Extraction Strategy
+```python
+await crawler.arun(
+ extraction_strategy=LLMExtractionStrategy(
+ provider="ollama/llama2",
+ schema=MySchema.schema(),
+ instruction="Extract specific data"
+ )
+)
+```
+
+### Chunking Strategy
+```python
+await crawler.arun(
+ chunking_strategy=RegexChunking(
+ patterns=[r'\n\n', r'\.\s+']
+ )
+)
+```
+
+### HTML to Text Options
+```python
+await crawler.arun(
+ html2text={
+ "ignore_links": False,
+ "ignore_images": False,
+ "escape_dot": False,
+ "body_width": 0,
+ "protect_links": True,
+ "unicode_snob": True
+ }
+)
+```
+
+## Debug Options
+```python
+await crawler.arun(
+ log_console=True, # Log browser console messages
+)
+```
+
+## Parameter Interactions and Notes
+
+1. **Cache and Performance Setup**
+ ```python
+ # Optimal caching for repeated crawls
+ await crawler.arun(
+ cache_mode=CacheMode.ENABLED,
+ word_count_threshold=10,
+ process_iframes=False
+ )
+ ```
+
+2. **Dynamic Content Handling**
+ ```python
+ # Handle lazy-loaded content
+ await crawler.arun(
+ js_code="window.scrollTo(0, document.body.scrollHeight);",
+ wait_for="css:.lazy-content",
+ delay_before_return_html=2.0,
+ cache_mode=CacheMode.WRITE_ONLY # Cache results after dynamic load
+ )
+ ```
+
+3. **Content Extraction Pipeline**
+ ```python
+ # Complete extraction setup
+ await crawler.arun(
+ css_selector=".main-content",
+ word_count_threshold=20,
+ extraction_strategy=my_strategy,
+ chunking_strategy=my_chunking,
+ process_iframes=True,
+ remove_overlay_elements=True,
+ cache_mode=CacheMode.ENABLED
+ )
+ ```
+
+## Best Practices
+
+1. **Performance Optimization**
+ ```python
+ await crawler.arun(
+ cache_mode=CacheMode.ENABLED, # Use full caching
+ word_count_threshold=10, # Filter out noise
+ process_iframes=False # Skip iframes if not needed
+ )
+ ```
+
+2. **Reliable Scraping**
+ ```python
+ await crawler.arun(
+ magic=True, # Enable anti-detection
+ delay_before_return_html=1.0, # Wait for dynamic content
+ page_timeout=60000, # Longer timeout for slow pages
+ cache_mode=CacheMode.WRITE_ONLY # Cache results after successful crawl
+ )
+ ```
+
+3. **Clean Content**
+ ```python
+ await crawler.arun(
+ remove_overlay_elements=True, # Remove popups
+        excluded_tags=['nav', 'aside'], # Remove unnecessary elements
+ keep_data_attributes=False, # Remove data attributes
+ cache_mode=CacheMode.ENABLED # Use cache for faster processing
+ )
+ ```
\ No newline at end of file
diff --git a/docs/md_v2/api/async-webcrawler.md b/docs/md_v2/api/async-webcrawler.md
new file mode 100644
index 0000000000000000000000000000000000000000..be95610153899a7eff50e2d133feb1c816dfcb70
--- /dev/null
+++ b/docs/md_v2/api/async-webcrawler.md
@@ -0,0 +1,320 @@
+# AsyncWebCrawler
+
+The `AsyncWebCrawler` class is the main interface for web crawling operations. It provides asynchronous web crawling capabilities with extensive configuration options.
+
+## Constructor
+
+```python
+AsyncWebCrawler(
+ # Browser Settings
+ browser_type: str = "chromium", # Options: "chromium", "firefox", "webkit"
+ headless: bool = True, # Run browser in headless mode
+ verbose: bool = False, # Enable verbose logging
+
+ # Cache Settings
+ always_by_pass_cache: bool = False, # Always bypass cache
+ base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), # Base directory for cache
+
+ # Network Settings
+ proxy: str = None, # Simple proxy URL
+ proxy_config: Dict = None, # Advanced proxy configuration
+
+ # Browser Behavior
+ sleep_on_close: bool = False, # Wait before closing browser
+
+ # Custom Settings
+ user_agent: str = None, # Custom user agent
+ headers: Dict[str, str] = {}, # Custom HTTP headers
+ js_code: Union[str, List[str]] = None, # Default JavaScript to execute
+)
+```
+
+### Parameters in Detail
+
+#### Browser Settings
+
+- **browser_type** (str, optional)
+ - Default: `"chromium"`
+ - Options: `"chromium"`, `"firefox"`, `"webkit"`
+ - Controls which browser engine to use
+ ```python
+ # Example: Using Firefox
+ crawler = AsyncWebCrawler(browser_type="firefox")
+ ```
+
+- **headless** (bool, optional)
+ - Default: `True`
+ - When `True`, browser runs without GUI
+ - Set to `False` for debugging
+ ```python
+ # Visible browser for debugging
+ crawler = AsyncWebCrawler(headless=False)
+ ```
+
+- **verbose** (bool, optional)
+ - Default: `False`
+ - Enables detailed logging
+ ```python
+ # Enable detailed logging
+ crawler = AsyncWebCrawler(verbose=True)
+ ```
+
+#### Cache Settings
+
+- **always_by_pass_cache** (bool, optional)
+ - Default: `False`
+ - When `True`, always fetches fresh content
+ ```python
+ # Always fetch fresh content
+ crawler = AsyncWebCrawler(always_by_pass_cache=True)
+ ```
+
+- **base_directory** (str, optional)
+ - Default: User's home directory
+ - Base path for cache storage
+ ```python
+ # Custom cache directory
+ crawler = AsyncWebCrawler(base_directory="/path/to/cache")
+ ```
+
+#### Network Settings
+
+- **proxy** (str, optional)
+ - Simple proxy URL
+ ```python
+ # Using simple proxy
+ crawler = AsyncWebCrawler(proxy="http://proxy.example.com:8080")
+ ```
+
+- **proxy_config** (Dict, optional)
+ - Advanced proxy configuration with authentication
+ ```python
+ # Advanced proxy with auth
+ crawler = AsyncWebCrawler(proxy_config={
+ "server": "http://proxy.example.com:8080",
+ "username": "user",
+ "password": "pass"
+ })
+ ```
+
+#### Browser Behavior
+
+- **sleep_on_close** (bool, optional)
+ - Default: `False`
+ - Adds delay before closing browser
+ ```python
+ # Wait before closing
+ crawler = AsyncWebCrawler(sleep_on_close=True)
+ ```
+
+#### Custom Settings
+
+- **user_agent** (str, optional)
+ - Custom user agent string
+ ```python
+ # Custom user agent
+ crawler = AsyncWebCrawler(
+ user_agent="Mozilla/5.0 (Custom Agent) Chrome/90.0"
+ )
+ ```
+
+- **headers** (Dict[str, str], optional)
+ - Custom HTTP headers
+ ```python
+ # Custom headers
+ crawler = AsyncWebCrawler(
+ headers={
+ "Accept-Language": "en-US",
+ "Custom-Header": "Value"
+ }
+ )
+ ```
+
+- **js_code** (Union[str, List[str]], optional)
+ - Default JavaScript to execute on each page
+ ```python
+ # Default JavaScript
+ crawler = AsyncWebCrawler(
+ js_code=[
+ "window.scrollTo(0, document.body.scrollHeight);",
+ "document.querySelector('.load-more').click();"
+ ]
+ )
+ ```
+
+## Methods
+
+### arun()
+
+The primary method for crawling web pages.
+
+```python
+async def arun(
+ # Required
+ url: str, # URL to crawl
+
+ # Content Selection
+ css_selector: str = None, # CSS selector for content
+ word_count_threshold: int = 10, # Minimum words per block
+
+ # Cache Control
+ bypass_cache: bool = False, # Bypass cache for this request
+
+ # Session Management
+ session_id: str = None, # Session identifier
+
+ # Screenshot Options
+ screenshot: bool = False, # Take screenshot
+ screenshot_wait_for: float = None, # Wait before screenshot
+
+ # Content Processing
+ process_iframes: bool = False, # Process iframe content
+ remove_overlay_elements: bool = False, # Remove popups/modals
+
+ # Anti-Bot Settings
+ simulate_user: bool = False, # Simulate human behavior
+ override_navigator: bool = False, # Override navigator properties
+ magic: bool = False, # Enable all anti-detection
+
+ # Content Filtering
+ excluded_tags: List[str] = None, # HTML tags to exclude
+ exclude_external_links: bool = False, # Remove external links
+ exclude_social_media_links: bool = False, # Remove social media links
+
+ # JavaScript Handling
+ js_code: Union[str, List[str]] = None, # JavaScript to execute
+ wait_for: str = None, # Wait condition
+
+ # Page Loading
+ page_timeout: int = 60000, # Page load timeout (ms)
+ delay_before_return_html: float = None, # Wait before return
+
+ # Extraction
+ extraction_strategy: ExtractionStrategy = None # Extraction strategy
+) -> CrawlResult:
+```
+
+### Usage Examples
+
+#### Basic Crawling
+```python
+async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(url="https://example.com")
+```
+
+#### Advanced Crawling
+```python
+async with AsyncWebCrawler(
+ browser_type="firefox",
+ verbose=True,
+ headers={"Custom-Header": "Value"}
+) as crawler:
+ result = await crawler.arun(
+ url="https://example.com",
+ css_selector=".main-content",
+ word_count_threshold=20,
+ process_iframes=True,
+ magic=True,
+ wait_for="css:.dynamic-content",
+ screenshot=True
+ )
+```
+
+#### Session Management
+```python
+async with AsyncWebCrawler() as crawler:
+ # First request
+ result1 = await crawler.arun(
+ url="https://example.com/login",
+ session_id="my_session"
+ )
+
+ # Subsequent request using same session
+ result2 = await crawler.arun(
+ url="https://example.com/protected",
+ session_id="my_session"
+ )
+```
+
+## Context Manager
+
+AsyncWebCrawler implements the async context manager protocol:
+
+```python
+async def __aenter__(self) -> 'AsyncWebCrawler':
+ # Initialize browser and resources
+ return self
+
+async def __aexit__(self, *args):
+ # Cleanup resources
+ pass
+```
+
+Always use AsyncWebCrawler with async context manager:
+```python
+async with AsyncWebCrawler() as crawler:
+ # Your crawling code here
+ pass
+```
+
+## Best Practices
+
+1. **Resource Management**
+```python
+# Always use context manager
+async with AsyncWebCrawler() as crawler:
+ # Crawler will be properly cleaned up
+ pass
+```
+
+2. **Error Handling**
+```python
+try:
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(url="https://example.com")
+ if not result.success:
+ print(f"Crawl failed: {result.error_message}")
+except Exception as e:
+ print(f"Error: {str(e)}")
+```
+
+3. **Performance Optimization**
+```python
+# Enable caching for better performance
+crawler = AsyncWebCrawler(
+ always_by_pass_cache=False,
+ verbose=True
+)
+```
+
+4. **Anti-Detection**
+```python
+# Maximum stealth
+crawler = AsyncWebCrawler(
+ headless=True,
+ user_agent="Mozilla/5.0...",
+ headers={"Accept-Language": "en-US"}
+)
+result = await crawler.arun(
+ url="https://example.com",
+ magic=True,
+ simulate_user=True
+)
+```
+
+## Note on Browser Types
+
+Each browser type has its characteristics:
+
+- **chromium**: Best overall compatibility
+- **firefox**: Good for specific use cases
+- **webkit**: Lighter weight, good for basic crawling
+
+Choose based on your specific needs:
+```python
+# High compatibility
+crawler = AsyncWebCrawler(browser_type="chromium")
+
+# Memory efficient
+crawler = AsyncWebCrawler(browser_type="webkit")
+```
\ No newline at end of file
diff --git a/docs/md_v2/api/crawl-config.md b/docs/md_v2/api/crawl-config.md
new file mode 100644
index 0000000000000000000000000000000000000000..928ae1e2f23b5d6a563fdfe253eb81c3057319c0
--- /dev/null
+++ b/docs/md_v2/api/crawl-config.md
@@ -0,0 +1,85 @@
+# CrawlerRunConfig Parameters Documentation
+
+## Content Processing Parameters
+
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `word_count_threshold` | int | 200 | Minimum word count threshold before processing content |
+| `extraction_strategy` | ExtractionStrategy | None | Strategy to extract structured data from crawled pages. When None, uses NoExtractionStrategy |
+| `chunking_strategy` | ChunkingStrategy | RegexChunking() | Strategy to chunk content before extraction |
+| `markdown_generator` | MarkdownGenerationStrategy | None | Strategy for generating markdown from extracted content |
+| `content_filter` | RelevantContentFilter | None | Optional filter to prune irrelevant content |
+| `only_text` | bool | False | If True, attempt to extract text-only content where applicable |
+| `css_selector` | str | None | CSS selector to extract a specific portion of the page |
+| `excluded_tags` | list[str] | [] | List of HTML tags to exclude from processing |
+| `keep_data_attributes` | bool | False | If True, retain `data-*` attributes while removing unwanted attributes |
+| `remove_forms` | bool | False | If True, remove all form elements from the page |
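+
+As a quick illustration, the hedged sketch below builds a `CrawlerRunConfig` from several of the parameters documented above; the values are placeholders, and the import paths follow the other examples in these docs.
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.async_configs import CrawlerRunConfig
+
+config = CrawlerRunConfig(
+    word_count_threshold=50,        # Skip very short content blocks
+    css_selector=".article-body",   # Placeholder selector for the main content
+    excluded_tags=["form", "nav"],  # Drop navigation and form markup
+    only_text=True,                 # Prefer text-only extraction where applicable
+    remove_forms=True,              # Strip form elements entirely
+)
+
+async def main():
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(url="https://example.com", config=config)
+        print(result.markdown)
+
+asyncio.run(main())
+```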
+ console.warn(e)),p.throwUnescapedHTML))throw new V("One of your code blocks includes unescaped HTML.",e.innerHTML)
+ ;n=e;const a=n.textContent,r=t?h(a,{language:t,ignoreIllegals:!0}):E(a)
+ ;e.innerHTML=r.value,e.dataset.highlighted="yes",((e,n,t)=>{const a=n&&i[n]||t
+ ;e.classList.add("hljs"),e.classList.add("language-"+a)
+ })(e,t,r.language),e.result={language:r.language,re:r.relevance,
+ relevance:r.relevance},r.secondBest&&(e.secondBest={
+ language:r.secondBest.language,relevance:r.secondBest.relevance
+ }),x("after:highlightElement",{el:e,result:r,text:a})}let N=!1;function w(){
+ "loading"!==document.readyState?document.querySelectorAll(p.cssSelector).forEach(y):N=!0
+ }function v(e){return e=(e||"").toLowerCase(),a[e]||a[i[e]]}
+ function O(e,{languageName:n}){"string"==typeof e&&(e=[e]),e.forEach((e=>{
+ i[e.toLowerCase()]=n}))}function k(e){const n=v(e)
+ ;return n&&!n.disableAutodetect}function x(e,n){const t=e;r.forEach((e=>{
+ e[t]&&e[t](n)}))}
+ "undefined"!=typeof window&&window.addEventListener&&window.addEventListener("DOMContentLoaded",(()=>{
+ N&&w()}),!1),Object.assign(t,{highlight:h,highlightAuto:E,highlightAll:w,
+ highlightElement:y,
+ highlightBlock:e=>(q("10.7.0","highlightBlock will be removed entirely in v12.0"),
+ q("10.7.0","Please use highlightElement now."),y(e)),configure:e=>{p=Y(p,e)},
+ initHighlighting:()=>{
+ w(),q("10.6.0","initHighlighting() deprecated. Use highlightAll() now.")},
+ initHighlightingOnLoad:()=>{
+ w(),q("10.6.0","initHighlightingOnLoad() deprecated. Use highlightAll() now.")
+ },registerLanguage:(e,n)=>{let i=null;try{i=n(t)}catch(n){
+ if(K("Language definition for '{}' could not be registered.".replace("{}",e)),
+ !s)throw n;K(n),i=c}
+ i.name||(i.name=e),a[e]=i,i.rawDefinition=n.bind(null,t),i.aliases&&O(i.aliases,{
+ languageName:e})},unregisterLanguage:e=>{delete a[e]
+ ;for(const n of Object.keys(i))i[n]===e&&delete i[n]},
+ listLanguages:()=>Object.keys(a),getLanguage:v,registerAliases:O,
+ autoDetection:k,inherit:Y,addPlugin:e=>{(e=>{
+ e["before:highlightBlock"]&&!e["before:highlightElement"]&&(e["before:highlightElement"]=n=>{
+ e["before:highlightBlock"](Object.assign({block:n.el},n))
+ }),e["after:highlightBlock"]&&!e["after:highlightElement"]&&(e["after:highlightElement"]=n=>{
+ e["after:highlightBlock"](Object.assign({block:n.el},n))})})(e),r.push(e)},
+ removePlugin:e=>{const n=r.indexOf(e);-1!==n&&r.splice(n,1)}}),t.debugMode=()=>{
+ s=!1},t.safeMode=()=>{s=!0},t.versionString="11.9.0",t.regex={concat:b,
+ lookahead:d,either:m,optional:u,anyNumberOfTimes:g}
+ ;for(const n in C)"object"==typeof C[n]&&e(C[n]);return Object.assign(t,C),t
+ },te=ne({});te.newInstance=()=>ne({});var ae=te;const ie=e=>({IMPORTANT:{
+ scope:"meta",begin:"!important"},BLOCK_COMMENT:e.C_BLOCK_COMMENT_MODE,HEXCOLOR:{
+ scope:"number",begin:/#(([0-9a-fA-F]{3,4})|(([0-9a-fA-F]{2}){3,4}))\b/},
+ FUNCTION_DISPATCH:{className:"built_in",begin:/[\w-]+(?=\()/},
+ ATTRIBUTE_SELECTOR_MODE:{scope:"selector-attr",begin:/\[/,end:/\]/,illegal:"$",
+ contains:[e.APOS_STRING_MODE,e.QUOTE_STRING_MODE]},CSS_NUMBER_MODE:{
+ scope:"number",
+ begin:e.NUMBER_RE+"(%|em|ex|ch|rem|vw|vh|vmin|vmax|cm|mm|in|pt|pc|px|deg|grad|rad|turn|s|ms|Hz|kHz|dpi|dpcm|dppx)?",
+ relevance:0},CSS_VARIABLE:{className:"attr",begin:/--[A-Za-z_][A-Za-z0-9_-]*/}
+ }),re=["a","abbr","address","article","aside","audio","b","blockquote","body","button","canvas","caption","cite","code","dd","del","details","dfn","div","dl","dt","em","fieldset","figcaption","figure","footer","form","h1","h2","h3","h4","h5","h6","header","hgroup","html","i","iframe","img","input","ins","kbd","label","legend","li","main","mark","menu","nav","object","ol","p","q","quote","samp","section","span","strong","summary","sup","table","tbody","td","textarea","tfoot","th","thead","time","tr","ul","var","video"],se=["any-hover","any-pointer","aspect-ratio","color","color-gamut","color-index","device-aspect-ratio","device-height","device-width","display-mode","forced-colors","grid","height","hover","inverted-colors","monochrome","orientation","overflow-block","overflow-inline","pointer","prefers-color-scheme","prefers-contrast","prefers-reduced-motion","prefers-reduced-transparency","resolution","scan","scripting","update","width","min-width","max-width","min-height","max-height"],oe=["active","any-link","blank","checked","current","default","defined","dir","disabled","drop","empty","enabled","first","first-child","first-of-type","fullscreen","future","focus","focus-visible","focus-within","has","host","host-context","hover","indeterminate","in-range","invalid","is","lang","last-child","last-of-type","left","link","local-link","not","nth-child","nth-col","nth-last-child","nth-last-col","nth-last-of-type","nth-of-type","only-child","only-of-type","optional","out-of-range","past","placeholder-shown","read-only","read-write","required","right","root","scope","target","target-within","user-invalid","valid","visited","where"],le=["after","backdrop","before","cue","cue-region","first-letter","first-line","grammar-error","marker","part","placeholder","selection","slotted","spelling-error"],ce=["align-content","align-items","align-self","all","animation","animation-delay","animation-direction","animation-duration","animation-fill-mode","animation-iteration-count","animation-name","animation-play-state","animation-timing-function","backface-visibility","background","background-attachment","background-blend-mode","background-clip","background-color","background-image","background-origin","background-position","background-repeat","background-size","block-size","border","border-block","border-block-color","border-block-end","border-block-end-color","border-block-end-style","border-block-end-width","border-block-start","border-block-start-color","border-block-start-style","border-block-start-width","border-block-style","border-block-width","border-bottom","border-bottom-color","border-bottom-left-radius","border-bottom-right-radius","border-bottom-style","border-bottom-width","border-collapse","border-color","border-image","border-image-outset","border-image-repeat","border-image-slice","border-image-source","border-image-width","border-inline","border-inline-color","border-inline-end","border-inline-end-color","border-inline-end-style","border-inline-end-width","border-inline-start","border-inline-start-color","border-inline-start-style","border-inline-start-width","border-inline-style","border-inline-width","border-left","border-left-color","border-left-style","border-left-width","border-radius","border-right","border-right-color","border-right-style","border-right-width","border-spacing","border-style","border-top","border-top-color","border-top-left-radius","border-top-right-radius","border-top-style","border-top-width","border-width","bottom","box-decoration-break","box-shadow","box-sizing",
"break-after","break-before","break-inside","caption-side","caret-color","clear","clip","clip-path","clip-rule","color","column-count","column-fill","column-gap","column-rule","column-rule-color","column-rule-style","column-rule-width","column-span","column-width","columns","contain","content","content-visibility","counter-increment","counter-reset","cue","cue-after","cue-before","cursor","direction","display","empty-cells","filter","flex","flex-basis","flex-direction","flex-flow","flex-grow","flex-shrink","flex-wrap","float","flow","font","font-display","font-family","font-feature-settings","font-kerning","font-language-override","font-size","font-size-adjust","font-smoothing","font-stretch","font-style","font-synthesis","font-variant","font-variant-caps","font-variant-east-asian","font-variant-ligatures","font-variant-numeric","font-variant-position","font-variation-settings","font-weight","gap","glyph-orientation-vertical","grid","grid-area","grid-auto-columns","grid-auto-flow","grid-auto-rows","grid-column","grid-column-end","grid-column-start","grid-gap","grid-row","grid-row-end","grid-row-start","grid-template","grid-template-areas","grid-template-columns","grid-template-rows","hanging-punctuation","height","hyphens","icon","image-orientation","image-rendering","image-resolution","ime-mode","inline-size","isolation","justify-content","left","letter-spacing","line-break","line-height","list-style","list-style-image","list-style-position","list-style-type","margin","margin-block","margin-block-end","margin-block-start","margin-bottom","margin-inline","margin-inline-end","margin-inline-start","margin-left","margin-right","margin-top","marks","mask","mask-border","mask-border-mode","mask-border-outset","mask-border-repeat","mask-border-slice","mask-border-source","mask-border-width","mask-clip","mask-composite","mask-image","mask-mode","mask-origin","mask-position","mask-repeat","mask-size","mask-type","max-block-size","max-height","max-inline-size","max-width","min-block-size","min-height","min-inline-size","min-width","mix-blend-mode","nav-down","nav-index","nav-left","nav-right","nav-up","none","normal","object-fit","object-position","opacity","order","orphans","outline","outline-color","outline-offset","outline-style","outline-width","overflow","overflow-wrap","overflow-x","overflow-y","padding","padding-block","padding-block-end","padding-block-start","padding-bottom","padding-inline","padding-inline-end","padding-inline-start","padding-left","padding-right","padding-top","page-break-after","page-break-before","page-break-inside","pause","pause-after","pause-before","perspective","perspective-origin","pointer-events","position","quotes","resize","rest","rest-after","rest-before","right","row-gap","scroll-margin","scroll-margin-block","scroll-margin-block-end","scroll-margin-block-start","scroll-margin-bottom","scroll-margin-inline","scroll-margin-inline-end","scroll-margin-inline-start","scroll-margin-left","scroll-margin-right","scroll-margin-top","scroll-padding","scroll-padding-block","scroll-padding-block-end","scroll-padding-block-start","scroll-padding-bottom","scroll-padding-inline","scroll-padding-inline-end","scroll-padding-inline-start","scroll-padding-left","scroll-padding-right","scroll-padding-top","scroll-snap-align","scroll-snap-stop","scroll-snap-type","scrollbar-color","scrollbar-gutter","scrollbar-width","shape-image-threshold","shape-margin","shape-outside","speak","speak-as","src","tab-size","table-layout","text-align","text-align-all","text-align-last","text-comb
ine-upright","text-decoration","text-decoration-color","text-decoration-line","text-decoration-style","text-emphasis","text-emphasis-color","text-emphasis-position","text-emphasis-style","text-indent","text-justify","text-orientation","text-overflow","text-rendering","text-shadow","text-transform","text-underline-position","top","transform","transform-box","transform-origin","transform-style","transition","transition-delay","transition-duration","transition-property","transition-timing-function","unicode-bidi","vertical-align","visibility","voice-balance","voice-duration","voice-family","voice-pitch","voice-range","voice-rate","voice-stress","voice-volume","white-space","widows","width","will-change","word-break","word-spacing","word-wrap","writing-mode","z-index"].reverse(),de=oe.concat(le)
+ ;var ge="[0-9](_*[0-9])*",ue=`\\.(${ge})`,be="[0-9a-fA-F](_*[0-9a-fA-F])*",me={
+ className:"number",variants:[{
+ begin:`(\\b(${ge})((${ue})|\\.)?|(${ue}))[eE][+-]?(${ge})[fFdD]?\\b`},{
+ begin:`\\b(${ge})((${ue})[fFdD]?\\b|\\.([fFdD]\\b)?)`},{
+ begin:`(${ue})[fFdD]?\\b`},{begin:`\\b(${ge})[fFdD]\\b`},{
+ begin:`\\b0[xX]((${be})\\.?|(${be})?\\.(${be}))[pP][+-]?(${ge})[fFdD]?\\b`},{
+ begin:"\\b(0|[1-9](_*[0-9])*)[lL]?\\b"},{begin:`\\b0[xX](${be})[lL]?\\b`},{
+ begin:"\\b0(_*[0-7])*[lL]?\\b"},{begin:"\\b0[bB][01](_*[01])*[lL]?\\b"}],
+ relevance:0};function pe(e,n,t){return-1===t?"":e.replace(n,(a=>pe(e,n,t-1)))}
+ const _e="[A-Za-z$_][0-9A-Za-z$_]*",he=["as","in","of","if","for","while","finally","var","new","function","do","return","void","else","break","catch","instanceof","with","throw","case","default","try","switch","continue","typeof","delete","let","yield","const","class","debugger","async","await","static","import","from","export","extends"],fe=["true","false","null","undefined","NaN","Infinity"],Ee=["Object","Function","Boolean","Symbol","Math","Date","Number","BigInt","String","RegExp","Array","Float32Array","Float64Array","Int8Array","Uint8Array","Uint8ClampedArray","Int16Array","Int32Array","Uint16Array","Uint32Array","BigInt64Array","BigUint64Array","Set","Map","WeakSet","WeakMap","ArrayBuffer","SharedArrayBuffer","Atomics","DataView","JSON","Promise","Generator","GeneratorFunction","AsyncFunction","Reflect","Proxy","Intl","WebAssembly"],ye=["Error","EvalError","InternalError","RangeError","ReferenceError","SyntaxError","TypeError","URIError"],Ne=["setInterval","setTimeout","clearInterval","clearTimeout","require","exports","eval","isFinite","isNaN","parseFloat","parseInt","decodeURI","decodeURIComponent","encodeURI","encodeURIComponent","escape","unescape"],we=["arguments","this","super","console","window","document","localStorage","sessionStorage","module","global"],ve=[].concat(Ne,Ee,ye)
+ ;function Oe(e){const n=e.regex,t=_e,a={begin:/<[A-Za-z0-9\\._:-]+/,
+ end:/\/[A-Za-z0-9\\._:-]+>|\/>/,isTrulyOpeningTag:(e,n)=>{
+ const t=e[0].length+e.index,a=e.input[t]
+ ;if("<"===a||","===a)return void n.ignoreMatch();let i
+ ;">"===a&&(((e,{after:n})=>{const t="</"+e[0].slice(1)
+ ;return-1!==e.input.indexOf(t,n)})(e,{after:t})||n.ignoreMatch())
+ ;const r=e.input.substring(t)
+ ;((i=r.match(/^\s*=/))||(i=r.match(/^\s+extends\s+/))&&0===i.index)&&n.ignoreMatch()
+ }},i={$pattern:_e,keyword:he,literal:fe,built_in:ve,"variable.language":we
+ },r="[0-9](_?[0-9])*",s=`\\.(${r})`,o="0|[1-9](_?[0-9])*|0[0-7]*[89][0-9]*",l={
+ className:"number",variants:[{
+ begin:`(\\b(${o})((${s})|\\.)?|(${s}))[eE][+-]?(${r})\\b`},{
+ begin:`\\b(${o})\\b((${s})\\b|\\.)?|(${s})\\b`},{
+ begin:"\\b(0|[1-9](_?[0-9])*)n\\b"},{
+ begin:"\\b0[xX][0-9a-fA-F](_?[0-9a-fA-F])*n?\\b"},{
+ begin:"\\b0[bB][0-1](_?[0-1])*n?\\b"},{begin:"\\b0[oO][0-7](_?[0-7])*n?\\b"},{
+ begin:"\\b0[0-7]+n?\\b"}],relevance:0},c={className:"subst",begin:"\\$\\{",
+ end:"\\}",keywords:i,contains:[]},d={begin:"html`",end:"",starts:{end:"`",
+ returnEnd:!1,contains:[e.BACKSLASH_ESCAPE,c],subLanguage:"xml"}},g={
+ begin:"css`",end:"",starts:{end:"`",returnEnd:!1,
+ contains:[e.BACKSLASH_ESCAPE,c],subLanguage:"css"}},u={begin:"gql`",end:"",
+ starts:{end:"`",returnEnd:!1,contains:[e.BACKSLASH_ESCAPE,c],
+ subLanguage:"graphql"}},b={className:"string",begin:"`",end:"`",
+ contains:[e.BACKSLASH_ESCAPE,c]},m={className:"comment",
+ variants:[e.COMMENT(/\/\*\*(?!\/)/,"\\*/",{relevance:0,contains:[{
+ begin:"(?=@[A-Za-z]+)",relevance:0,contains:[{className:"doctag",
+ begin:"@[A-Za-z]+"},{className:"type",begin:"\\{",end:"\\}",excludeEnd:!0,
+ excludeBegin:!0,relevance:0},{className:"variable",begin:t+"(?=\\s*(-)|$)",
+ endsParent:!0,relevance:0},{begin:/(?=[^\n])\s/,relevance:0}]}]
+ }),e.C_BLOCK_COMMENT_MODE,e.C_LINE_COMMENT_MODE]
+ },p=[e.APOS_STRING_MODE,e.QUOTE_STRING_MODE,d,g,u,b,{match:/\$\d+/},l]
+ ;c.contains=p.concat({begin:/\{/,end:/\}/,keywords:i,contains:["self"].concat(p)
+ });const _=[].concat(m,c.contains),h=_.concat([{begin:/\(/,end:/\)/,keywords:i,
+ contains:["self"].concat(_)}]),f={className:"params",begin:/\(/,end:/\)/,
+ excludeBegin:!0,excludeEnd:!0,keywords:i,contains:h},E={variants:[{
+ match:[/class/,/\s+/,t,/\s+/,/extends/,/\s+/,n.concat(t,"(",n.concat(/\./,t),")*")],
+ scope:{1:"keyword",3:"title.class",5:"keyword",7:"title.class.inherited"}},{
+ match:[/class/,/\s+/,t],scope:{1:"keyword",3:"title.class"}}]},y={relevance:0,
+ match:n.either(/\bJSON/,/\b[A-Z][a-z]+([A-Z][a-z]*|\d)*/,/\b[A-Z]{2,}([A-Z][a-z]+|\d)+([A-Z][a-z]*)*/,/\b[A-Z]{2,}[a-z]+([A-Z][a-z]+|\d)*([A-Z][a-z]*)*/),
+ className:"title.class",keywords:{_:[...Ee,...ye]}},N={variants:[{
+ match:[/function/,/\s+/,t,/(?=\s*\()/]},{match:[/function/,/\s*(?=\()/]}],
+ className:{1:"keyword",3:"title.function"},label:"func.def",contains:[f],
+ illegal:/%/},w={
+ match:n.concat(/\b/,(v=[...Ne,"super","import"],n.concat("(?!",v.join("|"),")")),t,n.lookahead(/\(/)),
+ className:"title.function",relevance:0};var v;const O={
+ begin:n.concat(/\./,n.lookahead(n.concat(t,/(?![0-9A-Za-z$_(])/))),end:t,
+ excludeBegin:!0,keywords:"prototype",className:"property",relevance:0},k={
+ match:[/get|set/,/\s+/,t,/(?=\()/],className:{1:"keyword",3:"title.function"},
+ contains:[{begin:/\(\)/},f]
+ },x="(\\([^()]*(\\([^()]*(\\([^()]*\\)[^()]*)*\\)[^()]*)*\\)|"+e.UNDERSCORE_IDENT_RE+")\\s*=>",M={
+ match:[/const|var|let/,/\s+/,t,/\s*/,/=\s*/,/(async\s*)?/,n.lookahead(x)],
+ keywords:"async",className:{1:"keyword",3:"title.function"},contains:[f]}
+ ;return{name:"JavaScript",aliases:["js","jsx","mjs","cjs"],keywords:i,exports:{
+ PARAMS_CONTAINS:h,CLASS_REFERENCE:y},illegal:/#(?![$_A-z])/,
+ contains:[e.SHEBANG({label:"shebang",binary:"node",relevance:5}),{
+ label:"use_strict",className:"meta",relevance:10,
+ begin:/^\s*['"]use (strict|asm)['"]/
+ },e.APOS_STRING_MODE,e.QUOTE_STRING_MODE,d,g,u,b,m,{match:/\$\d+/},l,y,{
+ className:"attr",begin:t+n.lookahead(":"),relevance:0},M,{
+ begin:"("+e.RE_STARTERS_RE+"|\\b(case|return|throw)\\b)\\s*",
+ keywords:"return throw case",relevance:0,contains:[m,e.REGEXP_MODE,{
+ className:"function",begin:x,returnBegin:!0,end:"\\s*=>",contains:[{
+ className:"params",variants:[{begin:e.UNDERSCORE_IDENT_RE,relevance:0},{
+ className:null,begin:/\(\s*\)/,skip:!0},{begin:/\(/,end:/\)/,excludeBegin:!0,
+ excludeEnd:!0,keywords:i,contains:h}]}]},{begin:/,/,relevance:0},{match:/\s+/,
+ relevance:0},{variants:[{begin:"<>",end:"</>"},{
+ match:/<[A-Za-z0-9\\._:-]+\s*\/>/},{begin:a.begin,
+ "on:begin":a.isTrulyOpeningTag,end:a.end}],subLanguage:"xml",contains:[{
+ begin:a.begin,end:a.end,skip:!0,contains:["self"]}]}]},N,{
+ beginKeywords:"while if switch catch for"},{
+ begin:"\\b(?!function)"+e.UNDERSCORE_IDENT_RE+"\\([^()]*(\\([^()]*(\\([^()]*\\)[^()]*)*\\)[^()]*)*\\)\\s*\\{",
+ returnBegin:!0,label:"func.def",contains:[f,e.inherit(e.TITLE_MODE,{begin:t,
+ className:"title.function"})]},{match:/\.\.\./,relevance:0},O,{match:"\\$"+t,
+ relevance:0},{match:[/\bconstructor(?=\s*\()/],className:{1:"title.function"},
+ contains:[f]},w,{relevance:0,match:/\b[A-Z][A-Z_0-9]+\b/,
+ className:"variable.constant"},E,k,{match:/\$[(.]/}]}}
+ const ke=e=>b(/\b/,e,/\w$/.test(e)?/\b/:/\B/),xe=["Protocol","Type"].map(ke),Me=["init","self"].map(ke),Se=["Any","Self"],Ae=["actor","any","associatedtype","async","await",/as\?/,/as!/,"as","borrowing","break","case","catch","class","consume","consuming","continue","convenience","copy","default","defer","deinit","didSet","distributed","do","dynamic","each","else","enum","extension","fallthrough",/fileprivate\(set\)/,"fileprivate","final","for","func","get","guard","if","import","indirect","infix",/init\?/,/init!/,"inout",/internal\(set\)/,"internal","in","is","isolated","nonisolated","lazy","let","macro","mutating","nonmutating",/open\(set\)/,"open","operator","optional","override","postfix","precedencegroup","prefix",/private\(set\)/,"private","protocol",/public\(set\)/,"public","repeat","required","rethrows","return","set","some","static","struct","subscript","super","switch","throws","throw",/try\?/,/try!/,"try","typealias",/unowned\(safe\)/,/unowned\(unsafe\)/,"unowned","var","weak","where","while","willSet"],Ce=["false","nil","true"],Te=["assignment","associativity","higherThan","left","lowerThan","none","right"],Re=["#colorLiteral","#column","#dsohandle","#else","#elseif","#endif","#error","#file","#fileID","#fileLiteral","#filePath","#function","#if","#imageLiteral","#keyPath","#line","#selector","#sourceLocation","#warning"],De=["abs","all","any","assert","assertionFailure","debugPrint","dump","fatalError","getVaList","isKnownUniquelyReferenced","max","min","numericCast","pointwiseMax","pointwiseMin","precondition","preconditionFailure","print","readLine","repeatElement","sequence","stride","swap","swift_unboxFromSwiftValueWithType","transcode","type","unsafeBitCast","unsafeDowncast","withExtendedLifetime","withUnsafeMutablePointer","withUnsafePointer","withVaList","withoutActuallyEscaping","zip"],Ie=m(/[/=\-+!*%<>&|^~?]/,/[\u00A1-\u00A7]/,/[\u00A9\u00AB]/,/[\u00AC\u00AE]/,/[\u00B0\u00B1]/,/[\u00B6\u00BB\u00BF\u00D7\u00F7]/,/[\u2016-\u2017]/,/[\u2020-\u2027]/,/[\u2030-\u203E]/,/[\u2041-\u2053]/,/[\u2055-\u205E]/,/[\u2190-\u23FF]/,/[\u2500-\u2775]/,/[\u2794-\u2BFF]/,/[\u2E00-\u2E7F]/,/[\u3001-\u3003]/,/[\u3008-\u3020]/,/[\u3030]/),Le=m(Ie,/[\u0300-\u036F]/,/[\u1DC0-\u1DFF]/,/[\u20D0-\u20FF]/,/[\uFE00-\uFE0F]/,/[\uFE20-\uFE2F]/),Be=b(Ie,Le,"*"),$e=m(/[a-zA-Z_]/,/[\u00A8\u00AA\u00AD\u00AF\u00B2-\u00B5\u00B7-\u00BA]/,/[\u00BC-\u00BE\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF]/,/[\u0100-\u02FF\u0370-\u167F\u1681-\u180D\u180F-\u1DBF]/,/[\u1E00-\u1FFF]/,/[\u200B-\u200D\u202A-\u202E\u203F-\u2040\u2054\u2060-\u206F]/,/[\u2070-\u20CF\u2100-\u218F\u2460-\u24FF\u2776-\u2793]/,/[\u2C00-\u2DFF\u2E80-\u2FFF]/,/[\u3004-\u3007\u3021-\u302F\u3031-\u303F\u3040-\uD7FF]/,/[\uF900-\uFD3D\uFD40-\uFDCF\uFDF0-\uFE1F\uFE30-\uFE44]/,/[\uFE47-\uFEFE\uFF00-\uFFFD]/),ze=m($e,/\d/,/[\u0300-\u036F\u1DC0-\u1DFF\u20D0-\u20FF\uFE20-\uFE2F]/),Fe=b($e,ze,"*"),Ue=b(/[A-Z]/,ze,"*"),je=["attached","autoclosure",b(/convention\(/,m("swift","block","c"),/\)/),"discardableResult","dynamicCallable","dynamicMemberLookup","escaping","freestanding","frozen","GKInspectable","IBAction","IBDesignable","IBInspectable","IBOutlet","IBSegueAction","inlinable","main","nonobjc","NSApplicationMain","NSCopying","NSManaged",b(/objc\(/,Fe,/\)/),"objc","objcMembers","propertyWrapper","requires_stored_property_inits","resultBuilder","Sendable","testable","UIApplicationMain","unchecked","unknown","usableFromInline","warn_unqualified_access"],Pe=["iOS","iOSApplicationExtension","macOS","macOSApplicationExtension","macCatalyst","macCatalystApplicat
ionExtension","watchOS","watchOSApplicationExtension","tvOS","tvOSApplicationExtension","swift"]
+ ;var Ke=Object.freeze({__proto__:null,grmr_bash:e=>{const n=e.regex,t={},a={
+ begin:/\$\{/,end:/\}/,contains:["self",{begin:/:-/,contains:[t]}]}
+ ;Object.assign(t,{className:"variable",variants:[{
+ begin:n.concat(/\$[\w\d#@][\w\d_]*/,"(?![\\w\\d])(?![$])")},a]});const i={
+ className:"subst",begin:/\$\(/,end:/\)/,contains:[e.BACKSLASH_ESCAPE]},r={
+ begin:/<<-?\s*(?=\w+)/,starts:{contains:[e.END_SAME_AS_BEGIN({begin:/(\w+)/,
+ end:/(\w+)/,className:"string"})]}},s={className:"string",begin:/"/,end:/"/,
+ contains:[e.BACKSLASH_ESCAPE,t,i]};i.contains.push(s);const o={begin:/\$?\(\(/,
+ end:/\)\)/,contains:[{begin:/\d+#[0-9a-f]+/,className:"number"},e.NUMBER_MODE,t]
+ },l=e.SHEBANG({binary:"(fish|bash|zsh|sh|csh|ksh|tcsh|dash|scsh)",relevance:10
+ }),c={className:"function",begin:/\w[\w\d_]*\s*\(\s*\)\s*\{/,returnBegin:!0,
+ contains:[e.inherit(e.TITLE_MODE,{begin:/\w[\w\d_]*/})],relevance:0};return{
+ name:"Bash",aliases:["sh"],keywords:{$pattern:/\b[a-z][a-z0-9._-]+\b/,
+ keyword:["if","then","else","elif","fi","for","while","until","in","do","done","case","esac","function","select"],
+ literal:["true","false"],
+ built_in:["break","cd","continue","eval","exec","exit","export","getopts","hash","pwd","readonly","return","shift","test","times","trap","umask","unset","alias","bind","builtin","caller","command","declare","echo","enable","help","let","local","logout","mapfile","printf","read","readarray","source","type","typeset","ulimit","unalias","set","shopt","autoload","bg","bindkey","bye","cap","chdir","clone","comparguments","compcall","compctl","compdescribe","compfiles","compgroups","compquote","comptags","comptry","compvalues","dirs","disable","disown","echotc","echoti","emulate","fc","fg","float","functions","getcap","getln","history","integer","jobs","kill","limit","log","noglob","popd","print","pushd","pushln","rehash","sched","setcap","setopt","stat","suspend","ttyctl","unfunction","unhash","unlimit","unsetopt","vared","wait","whence","where","which","zcompile","zformat","zftp","zle","zmodload","zparseopts","zprof","zpty","zregexparse","zsocket","zstyle","ztcp","chcon","chgrp","chown","chmod","cp","dd","df","dir","dircolors","ln","ls","mkdir","mkfifo","mknod","mktemp","mv","realpath","rm","rmdir","shred","sync","touch","truncate","vdir","b2sum","base32","base64","cat","cksum","comm","csplit","cut","expand","fmt","fold","head","join","md5sum","nl","numfmt","od","paste","ptx","pr","sha1sum","sha224sum","sha256sum","sha384sum","sha512sum","shuf","sort","split","sum","tac","tail","tr","tsort","unexpand","uniq","wc","arch","basename","chroot","date","dirname","du","echo","env","expr","factor","groups","hostid","id","link","logname","nice","nohup","nproc","pathchk","pinky","printenv","printf","pwd","readlink","runcon","seq","sleep","stat","stdbuf","stty","tee","test","timeout","tty","uname","unlink","uptime","users","who","whoami","yes"]
+ },contains:[l,e.SHEBANG(),c,o,e.HASH_COMMENT_MODE,r,{match:/(\/[a-z._-]+)+/},s,{
+ match:/\\"/},{className:"string",begin:/'/,end:/'/},{match:/\\'/},t]}},
+ grmr_c:e=>{const n=e.regex,t=e.COMMENT("//","$",{contains:[{begin:/\\\n/}]
+ }),a="decltype\\(auto\\)",i="[a-zA-Z_]\\w*::",r="("+a+"|"+n.optional(i)+"[a-zA-Z_]\\w*"+n.optional("<[^<>]+>")+")",s={
+ className:"type",variants:[{begin:"\\b[a-z\\d_]*_t\\b"},{
+ match:/\batomic_[a-z]{3,6}\b/}]},o={className:"string",variants:[{
+ begin:'(u8?|U|L)?"',end:'"',illegal:"\\n",contains:[e.BACKSLASH_ESCAPE]},{
+ begin:"(u8?|U|L)?'(\\\\(x[0-9A-Fa-f]{2}|u[0-9A-Fa-f]{4,8}|[0-7]{3}|\\S)|.)",
+ end:"'",illegal:"."},e.END_SAME_AS_BEGIN({
+ begin:/(?:u8?|U|L)?R"([^()\\ ]{0,16})\(/,end:/\)([^()\\ ]{0,16})"/})]},l={
+ className:"number",variants:[{begin:"\\b(0b[01']+)"},{
+ begin:"(-?)\\b([\\d']+(\\.[\\d']*)?|\\.[\\d']+)((ll|LL|l|L)(u|U)?|(u|U)(ll|LL|l|L)?|f|F|b|B)"
+ },{
+ begin:"(-?)(\\b0[xX][a-fA-F0-9']+|(\\b[\\d']+(\\.[\\d']*)?|\\.[\\d']+)([eE][-+]?[\\d']+)?)"
+ }],relevance:0},c={className:"meta",begin:/#\s*[a-z]+\b/,end:/$/,keywords:{
+ keyword:"if else elif endif define undef warning error line pragma _Pragma ifdef ifndef include"
+ },contains:[{begin:/\\\n/,relevance:0},e.inherit(o,{className:"string"}),{
+ className:"string",begin:/<.*?>/},t,e.C_BLOCK_COMMENT_MODE]},d={
+ className:"title",begin:n.optional(i)+e.IDENT_RE,relevance:0
+ },g=n.optional(i)+e.IDENT_RE+"\\s*\\(",u={
+ keyword:["asm","auto","break","case","continue","default","do","else","enum","extern","for","fortran","goto","if","inline","register","restrict","return","sizeof","struct","switch","typedef","union","volatile","while","_Alignas","_Alignof","_Atomic","_Generic","_Noreturn","_Static_assert","_Thread_local","alignas","alignof","noreturn","static_assert","thread_local","_Pragma"],
+ type:["float","double","signed","unsigned","int","short","long","char","void","_Bool","_Complex","_Imaginary","_Decimal32","_Decimal64","_Decimal128","const","static","complex","bool","imaginary"],
+ literal:"true false NULL",
+ built_in:"std string wstring cin cout cerr clog stdin stdout stderr stringstream istringstream ostringstream auto_ptr deque list queue stack vector map set pair bitset multiset multimap unordered_set unordered_map unordered_multiset unordered_multimap priority_queue make_pair array shared_ptr abort terminate abs acos asin atan2 atan calloc ceil cosh cos exit exp fabs floor fmod fprintf fputs free frexp fscanf future isalnum isalpha iscntrl isdigit isgraph islower isprint ispunct isspace isupper isxdigit tolower toupper labs ldexp log10 log malloc realloc memchr memcmp memcpy memset modf pow printf putchar puts scanf sinh sin snprintf sprintf sqrt sscanf strcat strchr strcmp strcpy strcspn strlen strncat strncmp strncpy strpbrk strrchr strspn strstr tanh tan vfprintf vprintf vsprintf endl initializer_list unique_ptr"
+ },b=[c,s,t,e.C_BLOCK_COMMENT_MODE,l,o],m={variants:[{begin:/=/,end:/;/},{
+ begin:/\(/,end:/\)/},{beginKeywords:"new throw return else",end:/;/}],
+ keywords:u,contains:b.concat([{begin:/\(/,end:/\)/,keywords:u,
+ contains:b.concat(["self"]),relevance:0}]),relevance:0},p={
+ begin:"("+r+"[\\*&\\s]+)+"+g,returnBegin:!0,end:/[{;=]/,excludeEnd:!0,
+ keywords:u,illegal:/[^\w\s\*&:<>.]/,contains:[{begin:a,keywords:u,relevance:0},{
+ begin:g,returnBegin:!0,contains:[e.inherit(d,{className:"title.function"})],
+ relevance:0},{relevance:0,match:/,/},{className:"params",begin:/\(/,end:/\)/,
+ keywords:u,relevance:0,contains:[t,e.C_BLOCK_COMMENT_MODE,o,l,s,{begin:/\(/,
+ end:/\)/,keywords:u,relevance:0,contains:["self",t,e.C_BLOCK_COMMENT_MODE,o,l,s]
+ }]},s,t,e.C_BLOCK_COMMENT_MODE,c]};return{name:"C",aliases:["h"],keywords:u,
+ disableAutodetect:!0,illegal:"</",contains:[].concat(m,p,b,[c,{
+ begin:e.IDENT_RE+"::",keywords:u},{className:"class",
+ beginKeywords:"enum class struct union",end:/[{;:<>=]/,contains:[{
+ beginKeywords:"final class struct"},e.TITLE_MODE]}]),exports:{preprocessor:c,
+ strings:o,keywords:u}}},grmr_cpp:e=>{const n=e.regex,t=e.COMMENT("//","$",{
+ contains:[{begin:/\\\n/}]
+ }),a="decltype\\(auto\\)",i="[a-zA-Z_]\\w*::",r="(?!struct)("+a+"|"+n.optional(i)+"[a-zA-Z_]\\w*"+n.optional("<[^<>]+>")+")",s={
+ className:"type",begin:"\\b[a-z\\d_]*_t\\b"},o={className:"string",variants:[{
+ begin:'(u8?|U|L)?"',end:'"',illegal:"\\n",contains:[e.BACKSLASH_ESCAPE]},{
+ begin:"(u8?|U|L)?'(\\\\(x[0-9A-Fa-f]{2}|u[0-9A-Fa-f]{4,8}|[0-7]{3}|\\S)|.)",
+ end:"'",illegal:"."},e.END_SAME_AS_BEGIN({
+ begin:/(?:u8?|U|L)?R"([^()\\ ]{0,16})\(/,end:/\)([^()\\ ]{0,16})"/})]},l={
+ className:"number",variants:[{begin:"\\b(0b[01']+)"},{
+ begin:"(-?)\\b([\\d']+(\\.[\\d']*)?|\\.[\\d']+)((ll|LL|l|L)(u|U)?|(u|U)(ll|LL|l|L)?|f|F|b|B)"
+ },{
+ begin:"(-?)(\\b0[xX][a-fA-F0-9']+|(\\b[\\d']+(\\.[\\d']*)?|\\.[\\d']+)([eE][-+]?[\\d']+)?)"
+ }],relevance:0},c={className:"meta",begin:/#\s*[a-z]+\b/,end:/$/,keywords:{
+ keyword:"if else elif endif define undef warning error line pragma _Pragma ifdef ifndef include"
+ },contains:[{begin:/\\\n/,relevance:0},e.inherit(o,{className:"string"}),{
+ className:"string",begin:/<.*?>/},t,e.C_BLOCK_COMMENT_MODE]},d={
+ className:"title",begin:n.optional(i)+e.IDENT_RE,relevance:0
+ },g=n.optional(i)+e.IDENT_RE+"\\s*\\(",u={
+ type:["bool","char","char16_t","char32_t","char8_t","double","float","int","long","short","void","wchar_t","unsigned","signed","const","static"],
+ keyword:["alignas","alignof","and","and_eq","asm","atomic_cancel","atomic_commit","atomic_noexcept","auto","bitand","bitor","break","case","catch","class","co_await","co_return","co_yield","compl","concept","const_cast|10","consteval","constexpr","constinit","continue","decltype","default","delete","do","dynamic_cast|10","else","enum","explicit","export","extern","false","final","for","friend","goto","if","import","inline","module","mutable","namespace","new","noexcept","not","not_eq","nullptr","operator","or","or_eq","override","private","protected","public","reflexpr","register","reinterpret_cast|10","requires","return","sizeof","static_assert","static_cast|10","struct","switch","synchronized","template","this","thread_local","throw","transaction_safe","transaction_safe_dynamic","true","try","typedef","typeid","typename","union","using","virtual","volatile","while","xor","xor_eq"],
+ literal:["NULL","false","nullopt","nullptr","true"],built_in:["_Pragma"],
+ _type_hints:["any","auto_ptr","barrier","binary_semaphore","bitset","complex","condition_variable","condition_variable_any","counting_semaphore","deque","false_type","future","imaginary","initializer_list","istringstream","jthread","latch","lock_guard","multimap","multiset","mutex","optional","ostringstream","packaged_task","pair","promise","priority_queue","queue","recursive_mutex","recursive_timed_mutex","scoped_lock","set","shared_future","shared_lock","shared_mutex","shared_timed_mutex","shared_ptr","stack","string_view","stringstream","timed_mutex","thread","true_type","tuple","unique_lock","unique_ptr","unordered_map","unordered_multimap","unordered_multiset","unordered_set","variant","vector","weak_ptr","wstring","wstring_view"]
+ },b={className:"function.dispatch",relevance:0,keywords:{
+ _hint:["abort","abs","acos","apply","as_const","asin","atan","atan2","calloc","ceil","cerr","cin","clog","cos","cosh","cout","declval","endl","exchange","exit","exp","fabs","floor","fmod","forward","fprintf","fputs","free","frexp","fscanf","future","invoke","isalnum","isalpha","iscntrl","isdigit","isgraph","islower","isprint","ispunct","isspace","isupper","isxdigit","labs","launder","ldexp","log","log10","make_pair","make_shared","make_shared_for_overwrite","make_tuple","make_unique","malloc","memchr","memcmp","memcpy","memset","modf","move","pow","printf","putchar","puts","realloc","scanf","sin","sinh","snprintf","sprintf","sqrt","sscanf","std","stderr","stdin","stdout","strcat","strchr","strcmp","strcpy","strcspn","strlen","strncat","strncmp","strncpy","strpbrk","strrchr","strspn","strstr","swap","tan","tanh","terminate","to_underlying","tolower","toupper","vfprintf","visit","vprintf","vsprintf"]
+ },
+ begin:n.concat(/\b/,/(?!decltype)/,/(?!if)/,/(?!for)/,/(?!switch)/,/(?!while)/,e.IDENT_RE,n.lookahead(/(<[^<>]+>|)\s*\(/))
+ },m=[b,c,s,t,e.C_BLOCK_COMMENT_MODE,l,o],p={variants:[{begin:/=/,end:/;/},{
+ begin:/\(/,end:/\)/},{beginKeywords:"new throw return else",end:/;/}],
+ keywords:u,contains:m.concat([{begin:/\(/,end:/\)/,keywords:u,
+ contains:m.concat(["self"]),relevance:0}]),relevance:0},_={className:"function",
+ begin:"("+r+"[\\*&\\s]+)+"+g,returnBegin:!0,end:/[{;=]/,excludeEnd:!0,
+ keywords:u,illegal:/[^\w\s\*&:<>.]/,contains:[{begin:a,keywords:u,relevance:0},{
+ begin:g,returnBegin:!0,contains:[d],relevance:0},{begin:/::/,relevance:0},{
+ begin:/:/,endsWithParent:!0,contains:[o,l]},{relevance:0,match:/,/},{
+ className:"params",begin:/\(/,end:/\)/,keywords:u,relevance:0,
+ contains:[t,e.C_BLOCK_COMMENT_MODE,o,l,s,{begin:/\(/,end:/\)/,keywords:u,
+ relevance:0,contains:["self",t,e.C_BLOCK_COMMENT_MODE,o,l,s]}]
+ },s,t,e.C_BLOCK_COMMENT_MODE,c]};return{name:"C++",
+ aliases:["cc","c++","h++","hpp","hh","hxx","cxx"],keywords:u,illegal:"</",
+ classNameAliases:{"function.dispatch":"built_in"},
+ contains:[].concat(p,_,b,m,[c,{
+ begin:"\\b(deque|list|queue|priority_queue|pair|stack|vector|map|set|bitset|multiset|multimap|unordered_map|unordered_set|unordered_multiset|unordered_multimap|array|tuple|optional|variant|function)\\s*<(?!<)",
+ end:">",keywords:u,contains:["self",s]},{begin:e.IDENT_RE+"::",keywords:u},{
+ match:[/\b(?:enum(?:\s+(?:class|struct))?|class|struct|union)/,/\s+/,/\w+/],
+ className:{1:"keyword",3:"title.class"}}])}},grmr_csharp:e=>{const n={
+ keyword:["abstract","as","base","break","case","catch","class","const","continue","do","else","event","explicit","extern","finally","fixed","for","foreach","goto","if","implicit","in","interface","internal","is","lock","namespace","new","operator","out","override","params","private","protected","public","readonly","record","ref","return","scoped","sealed","sizeof","stackalloc","static","struct","switch","this","throw","try","typeof","unchecked","unsafe","using","virtual","void","volatile","while"].concat(["add","alias","and","ascending","async","await","by","descending","equals","from","get","global","group","init","into","join","let","nameof","not","notnull","on","or","orderby","partial","remove","select","set","unmanaged","value|0","var","when","where","with","yield"]),
+ built_in:["bool","byte","char","decimal","delegate","double","dynamic","enum","float","int","long","nint","nuint","object","sbyte","short","string","ulong","uint","ushort"],
+ literal:["default","false","null","true"]},t=e.inherit(e.TITLE_MODE,{
+ begin:"[a-zA-Z](\\.?\\w)*"}),a={className:"number",variants:[{
+ begin:"\\b(0b[01']+)"},{
+ begin:"(-?)\\b([\\d']+(\\.[\\d']*)?|\\.[\\d']+)(u|U|l|L|ul|UL|f|F|b|B)"},{
+ begin:"(-?)(\\b0[xX][a-fA-F0-9']+|(\\b[\\d']+(\\.[\\d']*)?|\\.[\\d']+)([eE][-+]?[\\d']+)?)"
+ }],relevance:0},i={className:"string",begin:'@"',end:'"',contains:[{begin:'""'}]
+ },r=e.inherit(i,{illegal:/\n/}),s={className:"subst",begin:/\{/,end:/\}/,
+ keywords:n},o=e.inherit(s,{illegal:/\n/}),l={className:"string",begin:/\$"/,
+ end:'"',illegal:/\n/,contains:[{begin:/\{\{/},{begin:/\}\}/
+ },e.BACKSLASH_ESCAPE,o]},c={className:"string",begin:/\$@"/,end:'"',contains:[{
+ begin:/\{\{/},{begin:/\}\}/},{begin:'""'},s]},d=e.inherit(c,{illegal:/\n/,
+ contains:[{begin:/\{\{/},{begin:/\}\}/},{begin:'""'},o]})
+ ;s.contains=[c,l,i,e.APOS_STRING_MODE,e.QUOTE_STRING_MODE,a,e.C_BLOCK_COMMENT_MODE],
+ o.contains=[d,l,r,e.APOS_STRING_MODE,e.QUOTE_STRING_MODE,a,e.inherit(e.C_BLOCK_COMMENT_MODE,{
+ illegal:/\n/})];const g={variants:[c,l,i,e.APOS_STRING_MODE,e.QUOTE_STRING_MODE]
+ },u={begin:"<",end:">",contains:[{beginKeywords:"in out"},t]
+ },b=e.IDENT_RE+"(<"+e.IDENT_RE+"(\\s*,\\s*"+e.IDENT_RE+")*>)?(\\[\\])?",m={
+ begin:"@"+e.IDENT_RE,relevance:0};return{name:"C#",aliases:["cs","c#"],
+ keywords:n,illegal:/::/,contains:[e.COMMENT("///","$",{returnBegin:!0,
+ contains:[{className:"doctag",variants:[{begin:"///",relevance:0},{
+ begin:"\x3c!--|--\x3e"},{begin:"</?",end:">"}]}]
+ }),e.C_LINE_COMMENT_MODE,e.C_BLOCK_COMMENT_MODE,{className:"meta",begin:"#",
+ end:"$",keywords:{
+ keyword:"if else elif endif define undef warning error line region endregion pragma checksum"
+ }},g,a,{beginKeywords:"class interface",relevance:0,end:/[{;=]/,
+ illegal:/[^\s:,]/,contains:[{beginKeywords:"where class"
+ },t,u,e.C_LINE_COMMENT_MODE,e.C_BLOCK_COMMENT_MODE]},{beginKeywords:"namespace",
+ relevance:0,end:/[{;=]/,illegal:/[^\s:]/,
+ contains:[t,e.C_LINE_COMMENT_MODE,e.C_BLOCK_COMMENT_MODE]},{
+ beginKeywords:"record",relevance:0,end:/[{;=]/,illegal:/[^\s:]/,
+ contains:[t,u,e.C_LINE_COMMENT_MODE,e.C_BLOCK_COMMENT_MODE]},{className:"meta",
+ begin:"^\\s*\\[(?=[\\w])",excludeBegin:!0,end:"\\]",excludeEnd:!0,contains:[{
+ className:"string",begin:/"/,end:/"/}]},{
+ beginKeywords:"new return throw await else",relevance:0},{className:"function",
+ begin:"("+b+"\\s+)+"+e.IDENT_RE+"\\s*(<[^=]+>\\s*)?\\(",returnBegin:!0,
+ end:/\s*[{;=]/,excludeEnd:!0,keywords:n,contains:[{
+ beginKeywords:"public private protected static internal protected abstract async extern override unsafe virtual new sealed partial",
+ relevance:0},{begin:e.IDENT_RE+"\\s*(<[^=]+>\\s*)?\\(",returnBegin:!0,
+ contains:[e.TITLE_MODE,u],relevance:0},{match:/\(\)/},{className:"params",
+ begin:/\(/,end:/\)/,excludeBegin:!0,excludeEnd:!0,keywords:n,relevance:0,
+ contains:[g,a,e.C_BLOCK_COMMENT_MODE]
+ },e.C_LINE_COMMENT_MODE,e.C_BLOCK_COMMENT_MODE]},m]}},grmr_css:e=>{
+ const n=e.regex,t=ie(e),a=[e.APOS_STRING_MODE,e.QUOTE_STRING_MODE];return{
+ name:"CSS",case_insensitive:!0,illegal:/[=|'\$]/,keywords:{
+ keyframePosition:"from to"},classNameAliases:{keyframePosition:"selector-tag"},
+ contains:[t.BLOCK_COMMENT,{begin:/-(webkit|moz|ms|o)-(?=[a-z])/
+ },t.CSS_NUMBER_MODE,{className:"selector-id",begin:/#[A-Za-z0-9_-]+/,relevance:0
+ },{className:"selector-class",begin:"\\.[a-zA-Z-][a-zA-Z0-9_-]*",relevance:0
+ },t.ATTRIBUTE_SELECTOR_MODE,{className:"selector-pseudo",variants:[{
+ begin:":("+oe.join("|")+")"},{begin:":(:)?("+le.join("|")+")"}]
+ },t.CSS_VARIABLE,{className:"attribute",begin:"\\b("+ce.join("|")+")\\b"},{
+ begin:/:/,end:/[;}{]/,
+ contains:[t.BLOCK_COMMENT,t.HEXCOLOR,t.IMPORTANT,t.CSS_NUMBER_MODE,...a,{
+ begin:/(url|data-uri)\(/,end:/\)/,relevance:0,keywords:{built_in:"url data-uri"
+ },contains:[...a,{className:"string",begin:/[^)]/,endsWithParent:!0,
+ excludeEnd:!0}]},t.FUNCTION_DISPATCH]},{begin:n.lookahead(/@/),end:"[{;]",
+ relevance:0,illegal:/:/,contains:[{className:"keyword",begin:/@-?\w[\w]*(-\w+)*/
+ },{begin:/\s/,endsWithParent:!0,excludeEnd:!0,relevance:0,keywords:{
+ $pattern:/[a-z-]+/,keyword:"and or not only",attribute:se.join(" ")},contains:[{
+ begin:/[a-z-]+(?=:)/,className:"attribute"},...a,t.CSS_NUMBER_MODE]}]},{
+ className:"selector-tag",begin:"\\b("+re.join("|")+")\\b"}]}},grmr_diff:e=>{
+ const n=e.regex;return{name:"Diff",aliases:["patch"],contains:[{
+ className:"meta",relevance:10,
+ match:n.either(/^@@ +-\d+,\d+ +\+\d+,\d+ +@@/,/^\*\*\* +\d+,\d+ +\*\*\*\*$/,/^--- +\d+,\d+ +----$/)
+ },{className:"comment",variants:[{
+ begin:n.either(/Index: /,/^index/,/={3,}/,/^-{3}/,/^\*{3} /,/^\+{3}/,/^diff --git/),
+ end:/$/},{match:/^\*{15}$/}]},{className:"addition",begin:/^\+/,end:/$/},{
+ className:"deletion",begin:/^-/,end:/$/},{className:"addition",begin:/^!/,
+ end:/$/}]}},grmr_go:e=>{const n={
+ keyword:["break","case","chan","const","continue","default","defer","else","fallthrough","for","func","go","goto","if","import","interface","map","package","range","return","select","struct","switch","type","var"],
+ type:["bool","byte","complex64","complex128","error","float32","float64","int8","int16","int32","int64","string","uint8","uint16","uint32","uint64","int","uint","uintptr","rune"],
+ literal:["true","false","iota","nil"],
+ built_in:["append","cap","close","complex","copy","imag","len","make","new","panic","print","println","real","recover","delete"]
+ };return{name:"Go",aliases:["golang"],keywords:n,illegal:"</",
+ contains:[e.C_LINE_COMMENT_MODE,e.C_BLOCK_COMMENT_MODE,{className:"string",
+ variants:[e.QUOTE_STRING_MODE,e.APOS_STRING_MODE,{begin:"`",end:"`"}]},{
+ className:"number",variants:[{begin:e.C_NUMBER_RE+"[i]",relevance:1
+ },e.C_NUMBER_MODE]},{begin:/:=/},{className:"function",beginKeywords:"func",
+ end:"\\s*(\\{|$)",excludeEnd:!0,contains:[e.TITLE_MODE,{className:"params",
+ begin:/\(/,end:/\)/,endsParent:!0,keywords:n,illegal:/["']/}]}]}},
+ grmr_graphql:e=>{const n=e.regex;return{name:"GraphQL",aliases:["gql"],
+ case_insensitive:!0,disableAutodetect:!1,keywords:{
+ keyword:["query","mutation","subscription","type","input","schema","directive","interface","union","scalar","fragment","enum","on"],
+ literal:["true","false","null"]},
+ contains:[e.HASH_COMMENT_MODE,e.QUOTE_STRING_MODE,e.NUMBER_MODE,{
+ scope:"punctuation",match:/[.]{3}/,relevance:0},{scope:"punctuation",
+ begin:/[\!\(\)\:\=\[\]\{\|\}]{1}/,relevance:0},{scope:"variable",begin:/\$/,
+ end:/\W/,excludeEnd:!0,relevance:0},{scope:"meta",match:/@\w+/,excludeEnd:!0},{
+ scope:"symbol",begin:n.concat(/[_A-Za-z][_0-9A-Za-z]*/,n.lookahead(/\s*:/)),
+ relevance:0}],illegal:[/[;<']/,/BEGIN/]}},grmr_ini:e=>{const n=e.regex,t={
+ className:"number",relevance:0,variants:[{begin:/([+-]+)?[\d]+_[\d_]+/},{
+ begin:e.NUMBER_RE}]},a=e.COMMENT();a.variants=[{begin:/;/,end:/$/},{begin:/#/,
+ end:/$/}];const i={className:"variable",variants:[{begin:/\$[\w\d"][\w\d_]*/},{
+ begin:/\$\{(.*?)\}/}]},r={className:"literal",
+ begin:/\bon|off|true|false|yes|no\b/},s={className:"string",
+ contains:[e.BACKSLASH_ESCAPE],variants:[{begin:"'''",end:"'''",relevance:10},{
+ begin:'"""',end:'"""',relevance:10},{begin:'"',end:'"'},{begin:"'",end:"'"}]
+ },o={begin:/\[/,end:/\]/,contains:[a,r,i,s,t,"self"],relevance:0
+ },l=n.either(/[A-Za-z0-9_-]+/,/"(\\"|[^"])*"/,/'[^']*'/);return{
+ name:"TOML, also INI",aliases:["toml"],case_insensitive:!0,illegal:/\S/,
+ contains:[a,{className:"section",begin:/\[+/,end:/\]+/},{
+ begin:n.concat(l,"(\\s*\\.\\s*",l,")*",n.lookahead(/\s*=\s*[^#\s]/)),
+ className:"attr",starts:{end:/$/,contains:[a,o,r,i,s,t]}}]}},grmr_java:e=>{
+ const n=e.regex,t="[\xc0-\u02b8a-zA-Z_$][\xc0-\u02b8a-zA-Z_$0-9]*",a=t+pe("(?:<"+t+"~~~(?:\\s*,\\s*"+t+"~~~)*>)?",/~~~/g,2),i={
+ keyword:["synchronized","abstract","private","var","static","if","const ","for","while","strictfp","finally","protected","import","native","final","void","enum","else","break","transient","catch","instanceof","volatile","case","assert","package","default","public","try","switch","continue","throws","protected","public","private","module","requires","exports","do","sealed","yield","permits"],
+ literal:["false","true","null"],
+ type:["char","boolean","long","float","int","byte","short","double"],
+ built_in:["super","this"]},r={className:"meta",begin:"@"+t,contains:[{
+ begin:/\(/,end:/\)/,contains:["self"]}]},s={className:"params",begin:/\(/,
+ end:/\)/,keywords:i,relevance:0,contains:[e.C_BLOCK_COMMENT_MODE],endsParent:!0}
+ ;return{name:"Java",aliases:["jsp"],keywords:i,illegal:/<\/|#/,
+ contains:[e.COMMENT("/\\*\\*","\\*/",{relevance:0,contains:[{begin:/\w+@/,
+ relevance:0},{className:"doctag",begin:"@[A-Za-z]+"}]}),{
+ begin:/import java\.[a-z]+\./,keywords:"import",relevance:2
+ },e.C_LINE_COMMENT_MODE,e.C_BLOCK_COMMENT_MODE,{begin:/"""/,end:/"""/,
+ className:"string",contains:[e.BACKSLASH_ESCAPE]
+ },e.APOS_STRING_MODE,e.QUOTE_STRING_MODE,{
+ match:[/\b(?:class|interface|enum|extends|implements|new)/,/\s+/,t],className:{
+ 1:"keyword",3:"title.class"}},{match:/non-sealed/,scope:"keyword"},{
+ begin:[n.concat(/(?!else)/,t),/\s+/,t,/\s+/,/=(?!=)/],className:{1:"type",
+ 3:"variable",5:"operator"}},{begin:[/record/,/\s+/,t],className:{1:"keyword",
+ 3:"title.class"},contains:[s,e.C_LINE_COMMENT_MODE,e.C_BLOCK_COMMENT_MODE]},{
+ beginKeywords:"new throw return else",relevance:0},{
+ begin:["(?:"+a+"\\s+)",e.UNDERSCORE_IDENT_RE,/\s*(?=\()/],className:{
+ 2:"title.function"},keywords:i,contains:[{className:"params",begin:/\(/,
+ end:/\)/,keywords:i,relevance:0,
+ contains:[r,e.APOS_STRING_MODE,e.QUOTE_STRING_MODE,me,e.C_BLOCK_COMMENT_MODE]
+ },e.C_LINE_COMMENT_MODE,e.C_BLOCK_COMMENT_MODE]},me,r]}},grmr_javascript:Oe,
+ grmr_json:e=>{const n=["true","false","null"],t={scope:"literal",
+ beginKeywords:n.join(" ")};return{name:"JSON",keywords:{literal:n},contains:[{
+ className:"attr",begin:/"(\\.|[^\\"\r\n])*"(?=\s*:)/,relevance:1.01},{
+ match:/[{}[\],:]/,className:"punctuation",relevance:0
+ },e.QUOTE_STRING_MODE,t,e.C_NUMBER_MODE,e.C_LINE_COMMENT_MODE,e.C_BLOCK_COMMENT_MODE],
+ illegal:"\\S"}},grmr_kotlin:e=>{const n={
+ keyword:"abstract as val var vararg get set class object open private protected public noinline crossinline dynamic final enum if else do while for when throw try catch finally import package is in fun override companion reified inline lateinit init interface annotation data sealed internal infix operator out by constructor super tailrec where const inner suspend typealias external expect actual",
+ built_in:"Byte Short Char Int Long Boolean Float Double Void Unit Nothing",
+ literal:"true false null"},t={className:"symbol",begin:e.UNDERSCORE_IDENT_RE+"@"
+ },a={className:"subst",begin:/\$\{/,end:/\}/,contains:[e.C_NUMBER_MODE]},i={
+ className:"variable",begin:"\\$"+e.UNDERSCORE_IDENT_RE},r={className:"string",
+ variants:[{begin:'"""',end:'"""(?=[^"])',contains:[i,a]},{begin:"'",end:"'",
+ illegal:/\n/,contains:[e.BACKSLASH_ESCAPE]},{begin:'"',end:'"',illegal:/\n/,
+ contains:[e.BACKSLASH_ESCAPE,i,a]}]};a.contains.push(r);const s={
+ className:"meta",
+ begin:"@(?:file|property|field|get|set|receiver|param|setparam|delegate)\\s*:(?:\\s*"+e.UNDERSCORE_IDENT_RE+")?"
+ },o={className:"meta",begin:"@"+e.UNDERSCORE_IDENT_RE,contains:[{begin:/\(/,
+ end:/\)/,contains:[e.inherit(r,{className:"string"}),"self"]}]
+ },l=me,c=e.COMMENT("/\\*","\\*/",{contains:[e.C_BLOCK_COMMENT_MODE]}),d={
+ variants:[{className:"type",begin:e.UNDERSCORE_IDENT_RE},{begin:/\(/,end:/\)/,
+ contains:[]}]},g=d;return g.variants[1].contains=[d],d.variants[1].contains=[g],
+ {name:"Kotlin",aliases:["kt","kts"],keywords:n,
+ contains:[e.COMMENT("/\\*\\*","\\*/",{relevance:0,contains:[{className:"doctag",
+ begin:"@[A-Za-z]+"}]}),e.C_LINE_COMMENT_MODE,c,{className:"keyword",
+ begin:/\b(break|continue|return|this)\b/,starts:{contains:[{className:"symbol",
+ begin:/@\w+/}]}},t,s,o,{className:"function",beginKeywords:"fun",end:"[(]|$",
+ returnBegin:!0,excludeEnd:!0,keywords:n,relevance:5,contains:[{
+ begin:e.UNDERSCORE_IDENT_RE+"\\s*\\(",returnBegin:!0,relevance:0,
+ contains:[e.UNDERSCORE_TITLE_MODE]},{className:"type",begin:/</,end:/>/,
+ keywords:"reified",relevance:0},{className:"params",begin:/\(/,end:/\)/,
+ endsParent:!0,keywords:n,relevance:0,contains:[{begin:/:/,end:/[=,\/]/,
+ endsWithParent:!0,contains:[d,e.C_LINE_COMMENT_MODE,c],relevance:0
+ },e.C_LINE_COMMENT_MODE,c,s,o,r,e.C_NUMBER_MODE]},c]},{
+ begin:[/class|interface|trait/,/\s+/,e.UNDERSCORE_IDENT_RE],beginScope:{
+ 3:"title.class"},keywords:"class interface trait",end:/[:\{(]|$/,excludeEnd:!0,
+ illegal:"extends implements",contains:[{
+ beginKeywords:"public protected internal private constructor"
+ },e.UNDERSCORE_TITLE_MODE,{className:"type",begin:/</,end:/>/,excludeBegin:!0,
+ excludeEnd:!0,relevance:0},{className:"type",begin:/[,:]\s*/,end:/[<\(,){\s]|$/,
+ excludeBegin:!0,returnEnd:!0},s,o]},r,{className:"meta",begin:"^#!/usr/bin/env",
+ end:"$",illegal:"\n"},l]}},grmr_less:e=>{
+ const n=ie(e),t=de,a="[\\w-]+",i="("+a+"|@\\{"+a+"\\})",r=[],s=[],o=e=>({
+ className:"string",begin:"~?"+e+".*?"+e}),l=(e,n,t)=>({className:e,begin:n,
+ relevance:t}),c={$pattern:/[a-z-]+/,keyword:"and or not only",
+ attribute:se.join(" ")},d={begin:"\\(",end:"\\)",contains:s,keywords:c,
+ relevance:0}
+ ;s.push(e.C_LINE_COMMENT_MODE,e.C_BLOCK_COMMENT_MODE,o("'"),o('"'),n.CSS_NUMBER_MODE,{
+ begin:"(url|data-uri)\\(",starts:{className:"string",end:"[\\)\\n]",
+ excludeEnd:!0}
+ },n.HEXCOLOR,d,l("variable","@@?"+a,10),l("variable","@\\{"+a+"\\}"),l("built_in","~?`[^`]*?`"),{
+ className:"attribute",begin:a+"\\s*:",end:":",returnBegin:!0,excludeEnd:!0
+ },n.IMPORTANT,{beginKeywords:"and not"},n.FUNCTION_DISPATCH);const g=s.concat({
+ begin:/\{/,end:/\}/,contains:r}),u={beginKeywords:"when",endsWithParent:!0,
+ contains:[{beginKeywords:"and not"}].concat(s)},b={begin:i+"\\s*:",
+ returnBegin:!0,end:/[;}]/,relevance:0,contains:[{begin:/-(webkit|moz|ms|o)-/
+ },n.CSS_VARIABLE,{className:"attribute",begin:"\\b("+ce.join("|")+")\\b",
+ end:/(?=:)/,starts:{endsWithParent:!0,illegal:"[<=$]",relevance:0,contains:s}}]
+ },m={className:"keyword",
+ begin:"@(import|media|charset|font-face|(-[a-z]+-)?keyframes|supports|document|namespace|page|viewport|host)\\b",
+ starts:{end:"[;{}]",keywords:c,returnEnd:!0,contains:s,relevance:0}},p={
+ className:"variable",variants:[{begin:"@"+a+"\\s*:",relevance:15},{begin:"@"+a
+ }],starts:{end:"[;}]",returnEnd:!0,contains:g}},_={variants:[{
+ begin:"[\\.#:&\\[>]",end:"[;{}]"},{begin:i,end:/\{/}],returnBegin:!0,
+ returnEnd:!0,illegal:"[<='$\"]",relevance:0,
+ contains:[e.C_LINE_COMMENT_MODE,e.C_BLOCK_COMMENT_MODE,u,l("keyword","all\\b"),l("variable","@\\{"+a+"\\}"),{
+ begin:"\\b("+re.join("|")+")\\b",className:"selector-tag"
+ },n.CSS_NUMBER_MODE,l("selector-tag",i,0),l("selector-id","#"+i),l("selector-class","\\."+i,0),l("selector-tag","&",0),n.ATTRIBUTE_SELECTOR_MODE,{
+ className:"selector-pseudo",begin:":("+oe.join("|")+")"},{
+ className:"selector-pseudo",begin:":(:)?("+le.join("|")+")"},{begin:/\(/,
+ end:/\)/,relevance:0,contains:g},{begin:"!important"},n.FUNCTION_DISPATCH]},h={
+ begin:a+":(:)?"+`(${t.join("|")})`,returnBegin:!0,contains:[_]}
+ ;return r.push(e.C_LINE_COMMENT_MODE,e.C_BLOCK_COMMENT_MODE,m,p,h,b,_,u,n.FUNCTION_DISPATCH),
+ {name:"Less",case_insensitive:!0,illegal:"[=>'/<($\"]",contains:r}},
+ grmr_lua:e=>{const n="\\[=*\\[",t="\\]=*\\]",a={begin:n,end:t,contains:["self"]
+ },i=[e.COMMENT("--(?!"+n+")","$"),e.COMMENT("--"+n,t,{contains:[a],relevance:10
+ })];return{name:"Lua",keywords:{$pattern:e.UNDERSCORE_IDENT_RE,
+ literal:"true false nil",
+ keyword:"and break do else elseif end for goto if in local not or repeat return then until while",
+ built_in:"_G _ENV _VERSION __index __newindex __mode __call __metatable __tostring __len __gc __add __sub __mul __div __mod __pow __concat __unm __eq __lt __le assert collectgarbage dofile error getfenv getmetatable ipairs load loadfile loadstring module next pairs pcall print rawequal rawget rawset require select setfenv setmetatable tonumber tostring type unpack xpcall arg self coroutine resume yield status wrap create running debug getupvalue debug sethook getmetatable gethook setmetatable setlocal traceback setfenv getinfo setupvalue getlocal getregistry getfenv io lines write close flush open output type read stderr stdin input stdout popen tmpfile math log max acos huge ldexp pi cos tanh pow deg tan cosh sinh random randomseed frexp ceil floor rad abs sqrt modf asin min mod fmod log10 atan2 exp sin atan os exit setlocale date getenv difftime remove time clock tmpname rename execute package preload loadlib loaded loaders cpath config path seeall string sub upper len gfind rep find match char dump gmatch reverse byte format gsub lower table setn insert getn foreachi maxn foreach concat sort remove"
+ },contains:i.concat([{className:"function",beginKeywords:"function",end:"\\)",
+ contains:[e.inherit(e.TITLE_MODE,{
+ begin:"([_a-zA-Z]\\w*\\.)*([_a-zA-Z]\\w*:)?[_a-zA-Z]\\w*"}),{className:"params",
+ begin:"\\(",endsWithParent:!0,contains:i}].concat(i)
+ },e.C_NUMBER_MODE,e.APOS_STRING_MODE,e.QUOTE_STRING_MODE,{className:"string",
+ begin:n,end:t,contains:[a],relevance:5}])}},grmr_makefile:e=>{const n={
+ className:"variable",variants:[{begin:"\\$\\("+e.UNDERSCORE_IDENT_RE+"\\)",
+ contains:[e.BACKSLASH_ESCAPE]},{begin:/\$[@%\^\+\*]/}]},t={className:"string",
+ begin:/"/,end:/"/,contains:[e.BACKSLASH_ESCAPE,n]},a={className:"variable",
+ begin:/\$\([\w-]+\s/,end:/\)/,keywords:{
+ built_in:"subst patsubst strip findstring filter filter-out sort word wordlist firstword lastword dir notdir suffix basename addsuffix addprefix join wildcard realpath abspath error warning shell origin flavor foreach if or and call eval file value"
+ },contains:[n]},i={begin:"^"+e.UNDERSCORE_IDENT_RE+"\\s*(?=[:+?]?=)"},r={
+ className:"section",begin:/^[^\s]+:/,end:/$/,contains:[n]};return{
+ name:"Makefile",aliases:["mk","mak","make"],keywords:{$pattern:/[\w-]+/,
+ keyword:"define endef undefine ifdef ifndef ifeq ifneq else endif include -include sinclude override export unexport private vpath"
+ },contains:[e.HASH_COMMENT_MODE,n,t,a,i,{className:"meta",begin:/^\.PHONY:/,
+ end:/$/,keywords:{$pattern:/[\.\w]+/,keyword:".PHONY"}},r]}},grmr_markdown:e=>{
+ const n={begin:/<\/?[A-Za-z_]/,end:">",subLanguage:"xml",relevance:0},t={
+ variants:[{begin:/\[.+?\]\[.*?\]/,relevance:0},{
+ begin:/\[.+?\]\(((data|javascript|mailto):|(?:http|ftp)s?:\/\/).*?\)/,
+ relevance:2},{
+ begin:e.regex.concat(/\[.+?\]\(/,/[A-Za-z][A-Za-z0-9+.-]*/,/:\/\/.*?\)/),
+ relevance:2},{begin:/\[.+?\]\([./?].*?\)/,relevance:1},{
+ begin:/\[.*?\]\(.*?\)/,relevance:0}],returnBegin:!0,contains:[{match:/\[(?=\])/
+ },{className:"string",relevance:0,begin:"\\[",end:"\\]",excludeBegin:!0,
+ returnEnd:!0},{className:"link",relevance:0,begin:"\\]\\(",end:"\\)",
+ excludeBegin:!0,excludeEnd:!0},{className:"symbol",relevance:0,begin:"\\]\\[",
+ end:"\\]",excludeBegin:!0,excludeEnd:!0}]},a={className:"strong",contains:[],
+ variants:[{begin:/_{2}(?!\s)/,end:/_{2}/},{begin:/\*{2}(?!\s)/,end:/\*{2}/}]
+ },i={className:"emphasis",contains:[],variants:[{begin:/\*(?![*\s])/,end:/\*/},{
+ begin:/_(?![_\s])/,end:/_/,relevance:0}]},r=e.inherit(a,{contains:[]
+ }),s=e.inherit(i,{contains:[]});a.contains.push(s),i.contains.push(r)
+ ;let o=[n,t];return[a,i,r,s].forEach((e=>{e.contains=e.contains.concat(o)
+ })),o=o.concat(a,i),{name:"Markdown",aliases:["md","mkdown","mkd"],contains:[{
+ className:"section",variants:[{begin:"^#{1,6}",end:"$",contains:o},{
+ begin:"(?=^.+?\\n[=-]{2,}$)",contains:[{begin:"^[=-]*$"},{begin:"^",end:"\\n",
+ contains:o}]}]},n,{className:"bullet",begin:"^[ \t]*([*+-]|(\\d+\\.))(?=\\s+)",
+ end:"\\s+",excludeEnd:!0},a,i,{className:"quote",begin:"^>\\s+",contains:o,
+ end:"$"},{className:"code",variants:[{begin:"(`{3,})[^`](.|\\n)*?\\1`*[ ]*"},{
+ begin:"(~{3,})[^~](.|\\n)*?\\1~*[ ]*"},{begin:"```",end:"```+[ ]*$"},{
+ begin:"~~~",end:"~~~+[ ]*$"},{begin:"`.+?`"},{begin:"(?=^( {4}|\\t))",
+ contains:[{begin:"^( {4}|\\t)",end:"(\\n)$"}],relevance:0}]},{
+ begin:"^[-\\*]{3,}",end:"$"},t,{begin:/^\[[^\n]+\]:/,returnBegin:!0,contains:[{
+ className:"symbol",begin:/\[/,end:/\]/,excludeBegin:!0,excludeEnd:!0},{
+ className:"link",begin:/:\s*/,end:/$/,excludeBegin:!0}]}]}},grmr_objectivec:e=>{
+ const n=/[a-zA-Z@][a-zA-Z0-9_]*/,t={$pattern:n,
+ keyword:["@interface","@class","@protocol","@implementation"]};return{
+ name:"Objective-C",aliases:["mm","objc","obj-c","obj-c++","objective-c++"],
+ keywords:{"variable.language":["this","super"],$pattern:n,
+ keyword:["while","export","sizeof","typedef","const","struct","for","union","volatile","static","mutable","if","do","return","goto","enum","else","break","extern","asm","case","default","register","explicit","typename","switch","continue","inline","readonly","assign","readwrite","self","@synchronized","id","typeof","nonatomic","IBOutlet","IBAction","strong","weak","copy","in","out","inout","bycopy","byref","oneway","__strong","__weak","__block","__autoreleasing","@private","@protected","@public","@try","@property","@end","@throw","@catch","@finally","@autoreleasepool","@synthesize","@dynamic","@selector","@optional","@required","@encode","@package","@import","@defs","@compatibility_alias","__bridge","__bridge_transfer","__bridge_retained","__bridge_retain","__covariant","__contravariant","__kindof","_Nonnull","_Nullable","_Null_unspecified","__FUNCTION__","__PRETTY_FUNCTION__","__attribute__","getter","setter","retain","unsafe_unretained","nonnull","nullable","null_unspecified","null_resettable","class","instancetype","NS_DESIGNATED_INITIALIZER","NS_UNAVAILABLE","NS_REQUIRES_SUPER","NS_RETURNS_INNER_POINTER","NS_INLINE","NS_AVAILABLE","NS_DEPRECATED","NS_ENUM","NS_OPTIONS","NS_SWIFT_UNAVAILABLE","NS_ASSUME_NONNULL_BEGIN","NS_ASSUME_NONNULL_END","NS_REFINED_FOR_SWIFT","NS_SWIFT_NAME","NS_SWIFT_NOTHROW","NS_DURING","NS_HANDLER","NS_ENDHANDLER","NS_VALUERETURN","NS_VOIDRETURN"],
+ literal:["false","true","FALSE","TRUE","nil","YES","NO","NULL"],
+ built_in:["dispatch_once_t","dispatch_queue_t","dispatch_sync","dispatch_async","dispatch_once"],
+ type:["int","float","char","unsigned","signed","short","long","double","wchar_t","unichar","void","bool","BOOL","id|0","_Bool"]
+ },illegal:"</",contains:[{className:"built_in",
+ begin:"\\b(AV|CA|CF|CG|CI|CL|CM|CN|CT|MK|MP|MTK|MTL|NS|SCN|SK|UI|WK|XC)\\w+"
+ },e.C_LINE_COMMENT_MODE,e.C_BLOCK_COMMENT_MODE,e.C_NUMBER_MODE,e.QUOTE_STRING_MODE,e.APOS_STRING_MODE,{
+ className:"string",variants:[{begin:'@"',end:'"',illegal:"\\n",
+ contains:[e.BACKSLASH_ESCAPE]}]},{className:"meta",begin:/#\s*[a-z]+\b/,end:/$/,
+ keywords:{
+ keyword:"if else elif endif define undef warning error line pragma ifdef ifndef include"
+ },contains:[{begin:/\\\n/,relevance:0},e.inherit(e.QUOTE_STRING_MODE,{
+ className:"string"}),{className:"string",begin:/<.*?>/,end:/$/,illegal:"\\n"
+ },e.C_LINE_COMMENT_MODE,e.C_BLOCK_COMMENT_MODE]},{className:"class",
+ begin:"("+t.keyword.join("|")+")\\b",end:/(\{|$)/,excludeEnd:!0,keywords:t,
+ contains:[e.UNDERSCORE_TITLE_MODE]},{begin:"\\."+e.UNDERSCORE_IDENT_RE,
+ relevance:0}]}},grmr_perl:e=>{const n=e.regex,t=/[dualxmsipngr]{0,12}/,a={
+ $pattern:/[\w.]+/,
+ keyword:"abs accept alarm and atan2 bind binmode bless break caller chdir chmod chomp chop chown chr chroot close closedir connect continue cos crypt dbmclose dbmopen defined delete die do dump each else elsif endgrent endhostent endnetent endprotoent endpwent endservent eof eval exec exists exit exp fcntl fileno flock for foreach fork format formline getc getgrent getgrgid getgrnam gethostbyaddr gethostbyname gethostent getlogin getnetbyaddr getnetbyname getnetent getpeername getpgrp getpriority getprotobyname getprotobynumber getprotoent getpwent getpwnam getpwuid getservbyname getservbyport getservent getsockname getsockopt given glob gmtime goto grep gt hex if index int ioctl join keys kill last lc lcfirst length link listen local localtime log lstat lt ma map mkdir msgctl msgget msgrcv msgsnd my ne next no not oct open opendir or ord our pack package pipe pop pos print printf prototype push q|0 qq quotemeta qw qx rand read readdir readline readlink readpipe recv redo ref rename require reset return reverse rewinddir rindex rmdir say scalar seek seekdir select semctl semget semop send setgrent sethostent setnetent setpgrp setpriority setprotoent setpwent setservent setsockopt shift shmctl shmget shmread shmwrite shutdown sin sleep socket socketpair sort splice split sprintf sqrt srand stat state study sub substr symlink syscall sysopen sysread sysseek system syswrite tell telldir tie tied time times tr truncate uc ucfirst umask undef unless unlink unpack unshift untie until use utime values vec wait waitpid wantarray warn when while write x|0 xor y|0"
+ },i={className:"subst",begin:"[$@]\\{",end:"\\}",keywords:a},r={begin:/->\{/,
+ end:/\}/},s={variants:[{begin:/\$\d/},{
+ begin:n.concat(/[$%@](\^\w\b|#\w+(::\w+)*|\{\w+\}|\w+(::\w*)*)/,"(?![A-Za-z])(?![@$%])")
+ },{begin:/[$%@][^\s\w{]/,relevance:0}]
+ },o=[e.BACKSLASH_ESCAPE,i,s],l=[/!/,/\//,/\|/,/\?/,/'/,/"/,/#/],c=(e,a,i="\\1")=>{
+ const r="\\1"===i?i:n.concat(i,a)
+ ;return n.concat(n.concat("(?:",e,")"),a,/(?:\\.|[^\\\/])*?/,r,/(?:\\.|[^\\\/])*?/,i,t)
+ },d=(e,a,i)=>n.concat(n.concat("(?:",e,")"),a,/(?:\\.|[^\\\/])*?/,i,t),g=[s,e.HASH_COMMENT_MODE,e.COMMENT(/^=\w/,/=cut/,{
+ endsWithParent:!0}),r,{className:"string",contains:o,variants:[{
+ begin:"q[qwxr]?\\s*\\(",end:"\\)",relevance:5},{begin:"q[qwxr]?\\s*\\[",
+ end:"\\]",relevance:5},{begin:"q[qwxr]?\\s*\\{",end:"\\}",relevance:5},{
+ begin:"q[qwxr]?\\s*\\|",end:"\\|",relevance:5},{begin:"q[qwxr]?\\s*<",end:">",
+ relevance:5},{begin:"qw\\s+q",end:"q",relevance:5},{begin:"'",end:"'",
+ contains:[e.BACKSLASH_ESCAPE]},{begin:'"',end:'"'},{begin:"`",end:"`",
+ contains:[e.BACKSLASH_ESCAPE]},{begin:/\{\w+\}/,relevance:0},{
+ begin:"-?\\w+\\s*=>",relevance:0}]},{className:"number",
+ begin:"(\\b0[0-7_]+)|(\\b0x[0-9a-fA-F_]+)|(\\b[1-9][0-9_]*(\\.[0-9_]+)?)|[0_]\\b",
+ relevance:0},{
+ begin:"(\\/\\/|"+e.RE_STARTERS_RE+"|\\b(split|return|print|reverse|grep)\\b)\\s*",
+ keywords:"split return print reverse grep",relevance:0,
+ contains:[e.HASH_COMMENT_MODE,{className:"regexp",variants:[{
+ begin:c("s|tr|y",n.either(...l,{capture:!0}))},{begin:c("s|tr|y","\\(","\\)")},{
+ begin:c("s|tr|y","\\[","\\]")},{begin:c("s|tr|y","\\{","\\}")}],relevance:2},{
+ className:"regexp",variants:[{begin:/(m|qr)\/\//,relevance:0},{
+ begin:d("(?:m|qr)?",/\//,/\//)},{begin:d("m|qr",n.either(...l,{capture:!0
+ }),/\1/)},{begin:d("m|qr",/\(/,/\)/)},{begin:d("m|qr",/\[/,/\]/)},{
+ begin:d("m|qr",/\{/,/\}/)}]}]},{className:"function",beginKeywords:"sub",
+ end:"(\\s*\\(.*?\\))?[;{]",excludeEnd:!0,relevance:5,contains:[e.TITLE_MODE]},{
+ begin:"-\\w\\b",relevance:0},{begin:"^__DATA__$",end:"^__END__$",
+ subLanguage:"mojolicious",contains:[{begin:"^@@.*",end:"$",className:"comment"}]
+ }];return i.contains=g,r.contains=g,{name:"Perl",aliases:["pl","pm"],keywords:a,
+ contains:g}},grmr_php:e=>{
+ const n=e.regex,t=/(?![A-Za-z0-9])(?![$])/,a=n.concat(/[a-zA-Z_\x7f-\xff][a-zA-Z0-9_\x7f-\xff]*/,t),i=n.concat(/(\\?[A-Z][a-z0-9_\x7f-\xff]+|\\?[A-Z]+(?=[A-Z][a-z0-9_\x7f-\xff])){1,}/,t),r={
+ scope:"variable",match:"\\$+"+a},s={scope:"subst",variants:[{begin:/\$\w+/},{
+ begin:/\{\$/,end:/\}/}]},o=e.inherit(e.APOS_STRING_MODE,{illegal:null
+ }),l="[ \t\n]",c={scope:"string",variants:[e.inherit(e.QUOTE_STRING_MODE,{
+ illegal:null,contains:e.QUOTE_STRING_MODE.contains.concat(s)}),o,{
+ begin:/<<<[ \t]*(?:(\w+)|"(\w+)")\n/,end:/[ \t]*(\w+)\b/,
+ contains:e.QUOTE_STRING_MODE.contains.concat(s),"on:begin":(e,n)=>{
+ n.data._beginMatch=e[1]||e[2]},"on:end":(e,n)=>{
+ n.data._beginMatch!==e[1]&&n.ignoreMatch()}},e.END_SAME_AS_BEGIN({
+ begin:/<<<[ \t]*'(\w+)'\n/,end:/[ \t]*(\w+)\b/})]},d={scope:"number",variants:[{
+ begin:"\\b0[bB][01]+(?:_[01]+)*\\b"},{begin:"\\b0[oO][0-7]+(?:_[0-7]+)*\\b"},{
+ begin:"\\b0[xX][\\da-fA-F]+(?:_[\\da-fA-F]+)*\\b"},{
+ begin:"(?:\\b\\d+(?:_\\d+)*(\\.(?:\\d+(?:_\\d+)*))?|\\B\\.\\d+)(?:[eE][+-]?\\d+)?"
+ }],relevance:0
+ },g=["false","null","true"],u=["__CLASS__","__DIR__","__FILE__","__FUNCTION__","__COMPILER_HALT_OFFSET__","__LINE__","__METHOD__","__NAMESPACE__","__TRAIT__","die","echo","exit","include","include_once","print","require","require_once","array","abstract","and","as","binary","bool","boolean","break","callable","case","catch","class","clone","const","continue","declare","default","do","double","else","elseif","empty","enddeclare","endfor","endforeach","endif","endswitch","endwhile","enum","eval","extends","final","finally","float","for","foreach","from","global","goto","if","implements","instanceof","insteadof","int","integer","interface","isset","iterable","list","match|0","mixed","new","never","object","or","private","protected","public","readonly","real","return","string","switch","throw","trait","try","unset","use","var","void","while","xor","yield"],b=["Error|0","AppendIterator","ArgumentCountError","ArithmeticError","ArrayIterator","ArrayObject","AssertionError","BadFunctionCallException","BadMethodCallException","CachingIterator","CallbackFilterIterator","CompileError","Countable","DirectoryIterator","DivisionByZeroError","DomainException","EmptyIterator","ErrorException","Exception","FilesystemIterator","FilterIterator","GlobIterator","InfiniteIterator","InvalidArgumentException","IteratorIterator","LengthException","LimitIterator","LogicException","MultipleIterator","NoRewindIterator","OutOfBoundsException","OutOfRangeException","OuterIterator","OverflowException","ParentIterator","ParseError","RangeException","RecursiveArrayIterator","RecursiveCachingIterator","RecursiveCallbackFilterIterator","RecursiveDirectoryIterator","RecursiveFilterIterator","RecursiveIterator","RecursiveIteratorIterator","RecursiveRegexIterator","RecursiveTreeIterator","RegexIterator","RuntimeException","SeekableIterator","SplDoublyLinkedList","SplFileInfo","SplFileObject","SplFixedArray","SplHeap","SplMaxHeap","SplMinHeap","SplObjectStorage","SplObserver","SplPriorityQueue","SplQueue","SplStack","SplSubject","SplTempFileObject","TypeError","UnderflowException","UnexpectedValueException","UnhandledMatchError","ArrayAccess","BackedEnum","Closure","Fiber","Generator","Iterator","IteratorAggregate","Serializable","Stringable","Throwable","Traversable","UnitEnum","WeakReference","WeakMap","Directory","__PHP_Incomplete_Class","parent","php_user_filter","self","static","stdClass"],m={
+ keyword:u,literal:(e=>{const n=[];return e.forEach((e=>{
+ n.push(e),e.toLowerCase()===e?n.push(e.toUpperCase()):n.push(e.toLowerCase())
+ })),n})(g),built_in:b},p=e=>e.map((e=>e.replace(/\|\d+$/,""))),_={variants:[{
+ match:[/new/,n.concat(l,"+"),n.concat("(?!",p(b).join("\\b|"),"\\b)"),i],scope:{
+ 1:"keyword",4:"title.class"}}]},h=n.concat(a,"\\b(?!\\()"),f={variants:[{
+ match:[n.concat(/::/,n.lookahead(/(?!class\b)/)),h],scope:{2:"variable.constant"
+ }},{match:[/::/,/class/],scope:{2:"variable.language"}},{
+ match:[i,n.concat(/::/,n.lookahead(/(?!class\b)/)),h],scope:{1:"title.class",
+ 3:"variable.constant"}},{match:[i,n.concat("::",n.lookahead(/(?!class\b)/))],
+ scope:{1:"title.class"}},{match:[i,/::/,/class/],scope:{1:"title.class",
+ 3:"variable.language"}}]},E={scope:"attr",
+ match:n.concat(a,n.lookahead(":"),n.lookahead(/(?!::)/))},y={relevance:0,
+ begin:/\(/,end:/\)/,keywords:m,contains:[E,r,f,e.C_BLOCK_COMMENT_MODE,c,d,_]
+ },N={relevance:0,
+ match:[/\b/,n.concat("(?!fn\\b|function\\b|",p(u).join("\\b|"),"|",p(b).join("\\b|"),"\\b)"),a,n.concat(l,"*"),n.lookahead(/(?=\()/)],
+ scope:{3:"title.function.invoke"},contains:[y]};y.contains.push(N)
+ ;const w=[E,f,e.C_BLOCK_COMMENT_MODE,c,d,_];return{case_insensitive:!1,
+ keywords:m,contains:[{begin:n.concat(/#\[\s*/,i),beginScope:"meta",end:/]/,
+ endScope:"meta",keywords:{literal:g,keyword:["new","array"]},contains:[{
+ begin:/\[/,end:/]/,keywords:{literal:g,keyword:["new","array"]},
+ contains:["self",...w]},...w,{scope:"meta",match:i}]
+ },e.HASH_COMMENT_MODE,e.COMMENT("//","$"),e.COMMENT("/\\*","\\*/",{contains:[{
+ scope:"doctag",match:"@[A-Za-z]+"}]}),{match:/__halt_compiler\(\);/,
+ keywords:"__halt_compiler",starts:{scope:"comment",end:e.MATCH_NOTHING_RE,
+ contains:[{match:/\?>/,scope:"meta",endsParent:!0}]}},{scope:"meta",variants:[{
+ begin:/<\?php/,relevance:10},{begin:/<\?=/},{begin:/<\?/,relevance:.1},{
+ begin:/\?>/}]},{scope:"variable.language",match:/\$this\b/},r,N,f,{
+ match:[/const/,/\s/,a],scope:{1:"keyword",3:"variable.constant"}},_,{
+ scope:"function",relevance:0,beginKeywords:"fn function",end:/[;{]/,
+ excludeEnd:!0,illegal:"[$%\\[]",contains:[{beginKeywords:"use"
+ },e.UNDERSCORE_TITLE_MODE,{begin:"=>",endsParent:!0},{scope:"params",
+ begin:"\\(",end:"\\)",excludeBegin:!0,excludeEnd:!0,keywords:m,
+ contains:["self",r,f,e.C_BLOCK_COMMENT_MODE,c,d]}]},{scope:"class",variants:[{
+ beginKeywords:"enum",illegal:/[($"]/},{beginKeywords:"class interface trait",
+ illegal:/[:($"]/}],relevance:0,end:/\{/,excludeEnd:!0,contains:[{
+ beginKeywords:"extends implements"},e.UNDERSCORE_TITLE_MODE]},{
+ beginKeywords:"namespace",relevance:0,end:";",illegal:/[.']/,
+ contains:[e.inherit(e.UNDERSCORE_TITLE_MODE,{scope:"title.class"})]},{
+ beginKeywords:"use",relevance:0,end:";",contains:[{
+ match:/\b(as|const|function)\b/,scope:"keyword"},e.UNDERSCORE_TITLE_MODE]},c,d]}
+ },grmr_php_template:e=>({name:"PHP template",subLanguage:"xml",contains:[{
+ begin:/<\?(php|=)?/,end:/\?>/,subLanguage:"php",contains:[{begin:"/\\*",
+ end:"\\*/",skip:!0},{begin:'b"',end:'"',skip:!0},{begin:"b'",end:"'",skip:!0
+ },e.inherit(e.APOS_STRING_MODE,{illegal:null,className:null,contains:null,
+ skip:!0}),e.inherit(e.QUOTE_STRING_MODE,{illegal:null,className:null,
+ contains:null,skip:!0})]}]}),grmr_plaintext:e=>({name:"Plain text",
+ aliases:["text","txt"],disableAutodetect:!0}),grmr_python:e=>{
+ const n=e.regex,t=/[\p{XID_Start}_]\p{XID_Continue}*/u,a=["and","as","assert","async","await","break","case","class","continue","def","del","elif","else","except","finally","for","from","global","if","import","in","is","lambda","match","nonlocal|10","not","or","pass","raise","return","try","while","with","yield"],i={
+ $pattern:/[A-Za-z]\w+|__\w+__/,keyword:a,
+ built_in:["__import__","abs","all","any","ascii","bin","bool","breakpoint","bytearray","bytes","callable","chr","classmethod","compile","complex","delattr","dict","dir","divmod","enumerate","eval","exec","filter","float","format","frozenset","getattr","globals","hasattr","hash","help","hex","id","input","int","isinstance","issubclass","iter","len","list","locals","map","max","memoryview","min","next","object","oct","open","ord","pow","print","property","range","repr","reversed","round","set","setattr","slice","sorted","staticmethod","str","sum","super","tuple","type","vars","zip"],
+ literal:["__debug__","Ellipsis","False","None","NotImplemented","True"],
+ type:["Any","Callable","Coroutine","Dict","List","Literal","Generic","Optional","Sequence","Set","Tuple","Type","Union"]
+ },r={className:"meta",begin:/^(>>>|\.\.\.) /},s={className:"subst",begin:/\{/,
+ end:/\}/,keywords:i,illegal:/#/},o={begin:/\{\{/,relevance:0},l={
+ className:"string",contains:[e.BACKSLASH_ESCAPE],variants:[{
+ begin:/([uU]|[bB]|[rR]|[bB][rR]|[rR][bB])?'''/,end:/'''/,
+ contains:[e.BACKSLASH_ESCAPE,r],relevance:10},{
+ begin:/([uU]|[bB]|[rR]|[bB][rR]|[rR][bB])?"""/,end:/"""/,
+ contains:[e.BACKSLASH_ESCAPE,r],relevance:10},{
+ begin:/([fF][rR]|[rR][fF]|[fF])'''/,end:/'''/,
+ contains:[e.BACKSLASH_ESCAPE,r,o,s]},{begin:/([fF][rR]|[rR][fF]|[fF])"""/,
+ end:/"""/,contains:[e.BACKSLASH_ESCAPE,r,o,s]},{begin:/([uU]|[rR])'/,end:/'/,
+ relevance:10},{begin:/([uU]|[rR])"/,end:/"/,relevance:10},{
+ begin:/([bB]|[bB][rR]|[rR][bB])'/,end:/'/},{begin:/([bB]|[bB][rR]|[rR][bB])"/,
+ end:/"/},{begin:/([fF][rR]|[rR][fF]|[fF])'/,end:/'/,
+ contains:[e.BACKSLASH_ESCAPE,o,s]},{begin:/([fF][rR]|[rR][fF]|[fF])"/,end:/"/,
+ contains:[e.BACKSLASH_ESCAPE,o,s]},e.APOS_STRING_MODE,e.QUOTE_STRING_MODE]
+ },c="[0-9](_?[0-9])*",d=`(\\b(${c}))?\\.(${c})|\\b(${c})\\.`,g="\\b|"+a.join("|"),u={
+ className:"number",relevance:0,variants:[{
+ begin:`(\\b(${c})|(${d}))[eE][+-]?(${c})[jJ]?(?=${g})`},{begin:`(${d})[jJ]?`},{
+ begin:`\\b([1-9](_?[0-9])*|0+(_?0)*)[lLjJ]?(?=${g})`},{
+ begin:`\\b0[bB](_?[01])+[lL]?(?=${g})`},{begin:`\\b0[oO](_?[0-7])+[lL]?(?=${g})`
+ },{begin:`\\b0[xX](_?[0-9a-fA-F])+[lL]?(?=${g})`},{begin:`\\b(${c})[jJ](?=${g})`
+ }]},b={className:"comment",begin:n.lookahead(/# type:/),end:/$/,keywords:i,
+ contains:[{begin:/# type:/},{begin:/#/,end:/\b\B/,endsWithParent:!0}]},m={
+ className:"params",variants:[{className:"",begin:/\(\s*\)/,skip:!0},{begin:/\(/,
+ end:/\)/,excludeBegin:!0,excludeEnd:!0,keywords:i,
+ contains:["self",r,u,l,e.HASH_COMMENT_MODE]}]};return s.contains=[l,u,r],{
+ name:"Python",aliases:["py","gyp","ipython"],unicodeRegex:!0,keywords:i,
+ illegal:/(<\/|\?)|=>/,contains:[r,u,{begin:/\bself\b/},{beginKeywords:"if",
+ relevance:0},l,b,e.HASH_COMMENT_MODE,{match:[/\bdef/,/\s+/,t],scope:{
+ 1:"keyword",3:"title.function"},contains:[m]},{variants:[{
+ match:[/\bclass/,/\s+/,t,/\s*/,/\(\s*/,t,/\s*\)/]},{match:[/\bclass/,/\s+/,t]}],
+ scope:{1:"keyword",3:"title.class",6:"title.class.inherited"}},{
+ className:"meta",begin:/^[\t ]*@/,end:/(?=#)|$/,contains:[u,m,l]}]}},
+ grmr_python_repl:e=>({aliases:["pycon"],contains:[{className:"meta.prompt",
+ starts:{end:/ |$/,starts:{end:"$",subLanguage:"python"}},variants:[{
+ begin:/^>>>(?=[ ]|$)/},{begin:/^\.\.\.(?=[ ]|$)/}]}]}),grmr_r:e=>{
+ const n=e.regex,t=/(?:(?:[a-zA-Z]|\.[._a-zA-Z])[._a-zA-Z0-9]*)|\.(?!\d)/,a=n.either(/0[xX][0-9a-fA-F]+\.[0-9a-fA-F]*[pP][+-]?\d+i?/,/0[xX][0-9a-fA-F]+(?:[pP][+-]?\d+)?[Li]?/,/(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?[Li]?/),i=/[=!<>:]=|\|\||&&|:::?|<-|<<-|->>|->|\|>|[-+*\/?!$&|:<=>@^~]|\*\*/,r=n.either(/[()]/,/[{}]/,/\[\[/,/[[\]]/,/\\/,/,/)
+ ;return{name:"R",keywords:{$pattern:t,
+ keyword:"function if in break next repeat else for while",
+ literal:"NULL NA TRUE FALSE Inf NaN NA_integer_|10 NA_real_|10 NA_character_|10 NA_complex_|10",
+ built_in:"LETTERS letters month.abb month.name pi T F abs acos acosh all any anyNA Arg as.call as.character as.complex as.double as.environment as.integer as.logical as.null.default as.numeric as.raw asin asinh atan atanh attr attributes baseenv browser c call ceiling class Conj cos cosh cospi cummax cummin cumprod cumsum digamma dim dimnames emptyenv exp expression floor forceAndCall gamma gc.time globalenv Im interactive invisible is.array is.atomic is.call is.character is.complex is.double is.environment is.expression is.finite is.function is.infinite is.integer is.language is.list is.logical is.matrix is.na is.name is.nan is.null is.numeric is.object is.pairlist is.raw is.recursive is.single is.symbol lazyLoadDBfetch length lgamma list log max min missing Mod names nargs nzchar oldClass on.exit pos.to.env proc.time prod quote range Re rep retracemem return round seq_along seq_len seq.int sign signif sin sinh sinpi sqrt standardGeneric substitute sum switch tan tanh tanpi tracemem trigamma trunc unclass untracemem UseMethod xtfrm"
+ },contains:[e.COMMENT(/#'/,/$/,{contains:[{scope:"doctag",match:/@examples/,
+ starts:{end:n.lookahead(n.either(/\n^#'\s*(?=@[a-zA-Z]+)/,/\n^(?!#')/)),
+ endsParent:!0}},{scope:"doctag",begin:"@param",end:/$/,contains:[{
+ scope:"variable",variants:[{match:t},{match:/`(?:\\.|[^`\\])+`/}],endsParent:!0
+ }]},{scope:"doctag",match:/@[a-zA-Z]+/},{scope:"keyword",match:/\\[a-zA-Z]+/}]
+ }),e.HASH_COMMENT_MODE,{scope:"string",contains:[e.BACKSLASH_ESCAPE],
+ variants:[e.END_SAME_AS_BEGIN({begin:/[rR]"(-*)\(/,end:/\)(-*)"/
+ }),e.END_SAME_AS_BEGIN({begin:/[rR]"(-*)\{/,end:/\}(-*)"/
+ }),e.END_SAME_AS_BEGIN({begin:/[rR]"(-*)\[/,end:/\](-*)"/
+ }),e.END_SAME_AS_BEGIN({begin:/[rR]'(-*)\(/,end:/\)(-*)'/
+ }),e.END_SAME_AS_BEGIN({begin:/[rR]'(-*)\{/,end:/\}(-*)'/
+ }),e.END_SAME_AS_BEGIN({begin:/[rR]'(-*)\[/,end:/\](-*)'/}),{begin:'"',end:'"',
+ relevance:0},{begin:"'",end:"'",relevance:0}]},{relevance:0,variants:[{scope:{
+ 1:"operator",2:"number"},match:[i,a]},{scope:{1:"operator",2:"number"},
+ match:[/%[^%]*%/,a]},{scope:{1:"punctuation",2:"number"},match:[r,a]},{scope:{
+ 2:"number"},match:[/[^a-zA-Z0-9._]|^/,a]}]},{scope:{3:"operator"},
+ match:[t,/\s+/,/<-/,/\s+/]},{scope:"operator",relevance:0,variants:[{match:i},{
+ match:/%[^%]*%/}]},{scope:"punctuation",relevance:0,match:r},{begin:"`",end:"`",
+ contains:[{begin:/\\./}]}]}},grmr_ruby:e=>{
+ const n=e.regex,t="([a-zA-Z_]\\w*[!?=]?|[-+~]@|<<|>>|=~|===?|<=>|[<>]=?|\\*\\*|[-/+%^&*~`|]|\\[\\]=?)",a=n.either(/\b([A-Z]+[a-z0-9]+)+/,/\b([A-Z]+[a-z0-9]+)+[A-Z]+/),i=n.concat(a,/(::\w+)*/),r={
+ "variable.constant":["__FILE__","__LINE__","__ENCODING__"],
+ "variable.language":["self","super"],
+ keyword:["alias","and","begin","BEGIN","break","case","class","defined","do","else","elsif","end","END","ensure","for","if","in","module","next","not","or","redo","require","rescue","retry","return","then","undef","unless","until","when","while","yield","include","extend","prepend","public","private","protected","raise","throw"],
+ built_in:["proc","lambda","attr_accessor","attr_reader","attr_writer","define_method","private_constant","module_function"],
+ literal:["true","false","nil"]},s={className:"doctag",begin:"@[A-Za-z]+"},o={
+ begin:"#<",end:">"},l=[e.COMMENT("#","$",{contains:[s]
+ }),e.COMMENT("^=begin","^=end",{contains:[s],relevance:10
+ }),e.COMMENT("^__END__",e.MATCH_NOTHING_RE)],c={className:"subst",begin:/#\{/,
+ end:/\}/,keywords:r},d={className:"string",contains:[e.BACKSLASH_ESCAPE,c],
+ variants:[{begin:/'/,end:/'/},{begin:/"/,end:/"/},{begin:/`/,end:/`/},{
+ begin:/%[qQwWx]?\(/,end:/\)/},{begin:/%[qQwWx]?\[/,end:/\]/},{
+ begin:/%[qQwWx]?\{/,end:/\}/},{begin:/%[qQwWx]?</,end:/>/},{begin:/%[qQwWx]?\//,
+ end:/\//},{begin:/%[qQwWx]?%/,end:/%/},{begin:/%[qQwWx]?-/,end:/-/},{
+ begin:/%[qQwWx]?\|/,end:/\|/},{begin:/\B\?(\\\d{1,3})/},{
+ begin:/\B\?(\\x[A-Fa-f0-9]{1,2})/},{begin:/\B\?(\\u\{?[A-Fa-f0-9]{1,6}\}?)/},{
+ begin:/\B\?(\\M-\\C-|\\M-\\c|\\c\\M-|\\M-|\\C-\\M-)[\x20-\x7e]/},{
+ begin:/\B\?\\(c|C-)[\x20-\x7e]/},{begin:/\B\?\\?\S/},{
+ begin:n.concat(/<<[-~]?'?/,n.lookahead(/(\w+)(?=\W)[^\n]*\n(?:[^\n]*\n)*?\s*\1\b/)),
+ contains:[e.END_SAME_AS_BEGIN({begin:/(\w+)/,end:/(\w+)/,
+ contains:[e.BACKSLASH_ESCAPE,c]})]}]},g="[0-9](_?[0-9])*",u={className:"number",
+ relevance:0,variants:[{
+ begin:`\\b([1-9](_?[0-9])*|0)(\\.(${g}))?([eE][+-]?(${g})|r)?i?\\b`},{
+ begin:"\\b0[dD][0-9](_?[0-9])*r?i?\\b"},{begin:"\\b0[bB][0-1](_?[0-1])*r?i?\\b"
+ },{begin:"\\b0[oO][0-7](_?[0-7])*r?i?\\b"},{
+ begin:"\\b0[xX][0-9a-fA-F](_?[0-9a-fA-F])*r?i?\\b"},{
+ begin:"\\b0(_?[0-7])+r?i?\\b"}]},b={variants:[{match:/\(\)/},{
+ className:"params",begin:/\(/,end:/(?=\))/,excludeBegin:!0,endsParent:!0,
+ keywords:r}]},m=[d,{variants:[{match:[/class\s+/,i,/\s+<\s+/,i]},{
+ match:[/\b(class|module)\s+/,i]}],scope:{2:"title.class",
+ 4:"title.class.inherited"},keywords:r},{match:[/(include|extend)\s+/,i],scope:{
+ 2:"title.class"},keywords:r},{relevance:0,match:[i,/\.new[. (]/],scope:{
+ 1:"title.class"}},{relevance:0,match:/\b[A-Z][A-Z_0-9]+\b/,
+ className:"variable.constant"},{relevance:0,match:a,scope:"title.class"},{
+ match:[/def/,/\s+/,t],scope:{1:"keyword",3:"title.function"},contains:[b]},{
+ begin:e.IDENT_RE+"::"},{className:"symbol",
+ begin:e.UNDERSCORE_IDENT_RE+"(!|\\?)?:",relevance:0},{className:"symbol",
+ begin:":(?!\\s)",contains:[d,{begin:t}],relevance:0},u,{className:"variable",
+ begin:"(\\$\\W)|((\\$|@@?)(\\w+))(?=[^@$?])(?![A-Za-z])(?![@$?'])"},{
+ className:"params",begin:/\|/,end:/\|/,excludeBegin:!0,excludeEnd:!0,
+ relevance:0,keywords:r},{begin:"("+e.RE_STARTERS_RE+"|unless)\\s*",
+ keywords:"unless",contains:[{className:"regexp",contains:[e.BACKSLASH_ESCAPE,c],
+ illegal:/\n/,variants:[{begin:"/",end:"/[a-z]*"},{begin:/%r\{/,end:/\}[a-z]*/},{
+ begin:"%r\\(",end:"\\)[a-z]*"},{begin:"%r!",end:"![a-z]*"},{begin:"%r\\[",
+ end:"\\][a-z]*"}]}].concat(o,l),relevance:0}].concat(o,l)
+ ;c.contains=m,b.contains=m;const p=[{begin:/^\s*=>/,starts:{end:"$",contains:m}
+ },{className:"meta.prompt",
+ begin:"^([>?]>|[\\w#]+\\(\\w+\\):\\d+:\\d+[>*]|(\\w+-)?\\d+\\.\\d+\\.\\d+(p\\d+)?[^\\d][^>]+>)(?=[ ])",
+ starts:{end:"$",keywords:r,contains:m}}];return l.unshift(o),{name:"Ruby",
+ aliases:["rb","gemspec","podspec","thor","irb"],keywords:r,illegal:/\/\*/,
+ contains:[e.SHEBANG({binary:"ruby"})].concat(p).concat(l).concat(m)}},
+ grmr_rust:e=>{const n=e.regex,t={className:"title.function.invoke",relevance:0,
+ begin:n.concat(/\b/,/(?!let|for|while|if|else|match\b)/,e.IDENT_RE,n.lookahead(/\s*\(/))
+ },a="([ui](8|16|32|64|128|size)|f(32|64))?",i=["drop ","Copy","Send","Sized","Sync","Drop","Fn","FnMut","FnOnce","ToOwned","Clone","Debug","PartialEq","PartialOrd","Eq","Ord","AsRef","AsMut","Into","From","Default","Iterator","Extend","IntoIterator","DoubleEndedIterator","ExactSizeIterator","SliceConcatExt","ToString","assert!","assert_eq!","bitflags!","bytes!","cfg!","col!","concat!","concat_idents!","debug_assert!","debug_assert_eq!","env!","eprintln!","panic!","file!","format!","format_args!","include_bytes!","include_str!","line!","local_data_key!","module_path!","option_env!","print!","println!","select!","stringify!","try!","unimplemented!","unreachable!","vec!","write!","writeln!","macro_rules!","assert_ne!","debug_assert_ne!"],r=["i8","i16","i32","i64","i128","isize","u8","u16","u32","u64","u128","usize","f32","f64","str","char","bool","Box","Option","Result","String","Vec"]
+ ;return{name:"Rust",aliases:["rs"],keywords:{$pattern:e.IDENT_RE+"!?",type:r,
+ keyword:["abstract","as","async","await","become","box","break","const","continue","crate","do","dyn","else","enum","extern","false","final","fn","for","if","impl","in","let","loop","macro","match","mod","move","mut","override","priv","pub","ref","return","self","Self","static","struct","super","trait","true","try","type","typeof","unsafe","unsized","use","virtual","where","while","yield"],
+ literal:["true","false","Some","None","Ok","Err"],built_in:i},illegal:"",
+ contains:[e.C_LINE_COMMENT_MODE,e.COMMENT("/\\*","\\*/",{contains:["self"]
+ }),e.inherit(e.QUOTE_STRING_MODE,{begin:/b?"/,illegal:null}),{
+ className:"string",variants:[{begin:/b?r(#*)"(.|\n)*?"\1(?!#)/},{
+ begin:/b?'\\?(x\w{2}|u\w{4}|U\w{8}|.)'/}]},{className:"symbol",
+ begin:/'[a-zA-Z_][a-zA-Z0-9_]*/},{className:"number",variants:[{
+ begin:"\\b0b([01_]+)"+a},{begin:"\\b0o([0-7_]+)"+a},{
+ begin:"\\b0x([A-Fa-f0-9_]+)"+a},{
+ begin:"\\b(\\d[\\d_]*(\\.[0-9_]+)?([eE][+-]?[0-9_]+)?)"+a}],relevance:0},{
+ begin:[/fn/,/\s+/,e.UNDERSCORE_IDENT_RE],className:{1:"keyword",
+ 3:"title.function"}},{className:"meta",begin:"#!?\\[",end:"\\]",contains:[{
+ className:"string",begin:/"/,end:/"/}]},{
+ begin:[/let/,/\s+/,/(?:mut\s+)?/,e.UNDERSCORE_IDENT_RE],className:{1:"keyword",
+ 3:"keyword",4:"variable"}},{
+ begin:[/for/,/\s+/,e.UNDERSCORE_IDENT_RE,/\s+/,/in/],className:{1:"keyword",
+ 3:"variable",5:"keyword"}},{begin:[/type/,/\s+/,e.UNDERSCORE_IDENT_RE],
+ className:{1:"keyword",3:"title.class"}},{
+ begin:[/(?:trait|enum|struct|union|impl|for)/,/\s+/,e.UNDERSCORE_IDENT_RE],
+ className:{1:"keyword",3:"title.class"}},{begin:e.IDENT_RE+"::",keywords:{
+ keyword:"Self",built_in:i,type:r}},{className:"punctuation",begin:"->"},t]}},
+ grmr_scss:e=>{const n=ie(e),t=le,a=oe,i="@[a-z-]+",r={className:"variable",
+ begin:"(\\$[a-zA-Z-][a-zA-Z0-9_-]*)\\b",relevance:0};return{name:"SCSS",
+ case_insensitive:!0,illegal:"[=/|']",
+ contains:[e.C_LINE_COMMENT_MODE,e.C_BLOCK_COMMENT_MODE,n.CSS_NUMBER_MODE,{
+ className:"selector-id",begin:"#[A-Za-z0-9_-]+",relevance:0},{
+ className:"selector-class",begin:"\\.[A-Za-z0-9_-]+",relevance:0
+ },n.ATTRIBUTE_SELECTOR_MODE,{className:"selector-tag",
+ begin:"\\b("+re.join("|")+")\\b",relevance:0},{className:"selector-pseudo",
+ begin:":("+a.join("|")+")"},{className:"selector-pseudo",
+ begin:":(:)?("+t.join("|")+")"},r,{begin:/\(/,end:/\)/,
+ contains:[n.CSS_NUMBER_MODE]},n.CSS_VARIABLE,{className:"attribute",
+ begin:"\\b("+ce.join("|")+")\\b"},{
+ begin:"\\b(whitespace|wait|w-resize|visible|vertical-text|vertical-ideographic|uppercase|upper-roman|upper-alpha|underline|transparent|top|thin|thick|text|text-top|text-bottom|tb-rl|table-header-group|table-footer-group|sw-resize|super|strict|static|square|solid|small-caps|separate|se-resize|scroll|s-resize|rtl|row-resize|ridge|right|repeat|repeat-y|repeat-x|relative|progress|pointer|overline|outside|outset|oblique|nowrap|not-allowed|normal|none|nw-resize|no-repeat|no-drop|newspaper|ne-resize|n-resize|move|middle|medium|ltr|lr-tb|lowercase|lower-roman|lower-alpha|loose|list-item|line|line-through|line-edge|lighter|left|keep-all|justify|italic|inter-word|inter-ideograph|inside|inset|inline|inline-block|inherit|inactive|ideograph-space|ideograph-parenthesis|ideograph-numeric|ideograph-alpha|horizontal|hidden|help|hand|groove|fixed|ellipsis|e-resize|double|dotted|distribute|distribute-space|distribute-letter|distribute-all-lines|disc|disabled|default|decimal|dashed|crosshair|collapse|col-resize|circle|char|center|capitalize|break-word|break-all|bottom|both|bolder|bold|block|bidi-override|below|baseline|auto|always|all-scroll|absolute|table|table-cell)\\b"
+ },{begin:/:/,end:/[;}{]/,relevance:0,
+ contains:[n.BLOCK_COMMENT,r,n.HEXCOLOR,n.CSS_NUMBER_MODE,e.QUOTE_STRING_MODE,e.APOS_STRING_MODE,n.IMPORTANT,n.FUNCTION_DISPATCH]
+ },{begin:"@(page|font-face)",keywords:{$pattern:i,keyword:"@page @font-face"}},{
+ begin:"@",end:"[{;]",returnBegin:!0,keywords:{$pattern:/[a-z-]+/,
+ keyword:"and or not only",attribute:se.join(" ")},contains:[{begin:i,
+ className:"keyword"},{begin:/[a-z-]+(?=:)/,className:"attribute"
+ },r,e.QUOTE_STRING_MODE,e.APOS_STRING_MODE,n.HEXCOLOR,n.CSS_NUMBER_MODE]
+ },n.FUNCTION_DISPATCH]}},grmr_shell:e=>({name:"Shell Session",
+ aliases:["console","shellsession"],contains:[{className:"meta.prompt",
+ begin:/^\s{0,3}[/~\w\d[\]()@-]*[>%$#][ ]?/,starts:{end:/[^\\](?=\s*$)/,
+ subLanguage:"bash"}}]}),grmr_sql:e=>{
+ const n=e.regex,t=e.COMMENT("--","$"),a=["true","false","unknown"],i=["bigint","binary","blob","boolean","char","character","clob","date","dec","decfloat","decimal","float","int","integer","interval","nchar","nclob","national","numeric","real","row","smallint","time","timestamp","varchar","varying","varbinary"],r=["abs","acos","array_agg","asin","atan","avg","cast","ceil","ceiling","coalesce","corr","cos","cosh","count","covar_pop","covar_samp","cume_dist","dense_rank","deref","element","exp","extract","first_value","floor","json_array","json_arrayagg","json_exists","json_object","json_objectagg","json_query","json_table","json_table_primitive","json_value","lag","last_value","lead","listagg","ln","log","log10","lower","max","min","mod","nth_value","ntile","nullif","percent_rank","percentile_cont","percentile_disc","position","position_regex","power","rank","regr_avgx","regr_avgy","regr_count","regr_intercept","regr_r2","regr_slope","regr_sxx","regr_sxy","regr_syy","row_number","sin","sinh","sqrt","stddev_pop","stddev_samp","substring","substring_regex","sum","tan","tanh","translate","translate_regex","treat","trim","trim_array","unnest","upper","value_of","var_pop","var_samp","width_bucket"],s=["create table","insert into","primary key","foreign key","not null","alter table","add constraint","grouping sets","on overflow","character set","respect nulls","ignore nulls","nulls first","nulls last","depth first","breadth first"],o=r,l=["abs","acos","all","allocate","alter","and","any","are","array","array_agg","array_max_cardinality","as","asensitive","asin","asymmetric","at","atan","atomic","authorization","avg","begin","begin_frame","begin_partition","between","bigint","binary","blob","boolean","both","by","call","called","cardinality","cascaded","case","cast","ceil","ceiling","char","char_length","character","character_length","check","classifier","clob","close","coalesce","collate","collect","column","commit","condition","connect","constraint","contains","convert","copy","corr","corresponding","cos","cosh","count","covar_pop","covar_samp","create","cross","cube","cume_dist","current","current_catalog","current_date","current_default_transform_group","current_path","current_role","current_row","current_schema","current_time","current_timestamp","current_path","current_role","current_transform_group_for_type","current_user","cursor","cycle","date","day","deallocate","dec","decimal","decfloat","declare","default","define","delete","dense_rank","deref","describe","deterministic","disconnect","distinct","double","drop","dynamic","each","element","else","empty","end","end_frame","end_partition","end-exec","equals","escape","every","except","exec","execute","exists","exp","external","extract","false","fetch","filter","first_value","float","floor","for","foreign","frame_row","free","from","full","function","fusion","get","global","grant","group","grouping","groups","having","hold","hour","identity","in","indicator","initial","inner","inout","insensitive","insert","int","integer","intersect","intersection","interval","into","is","join","json_array","json_arrayagg","json_exists","json_object","json_objectagg","json_query","json_table","json_table_primitive","json_value","lag","language","large","last_value","lateral","lead","leading","left","like","like_regex","listagg","ln","local","localtime","localtimestamp","log","log10","lower","match","match_number","match_recognize","matches","max","member","merge","method","min","minute","mod","modifies","module","month","multiset","national","natural","nch
ar","nclob","new","no","none","normalize","not","nth_value","ntile","null","nullif","numeric","octet_length","occurrences_regex","of","offset","old","omit","on","one","only","open","or","order","out","outer","over","overlaps","overlay","parameter","partition","pattern","per","percent","percent_rank","percentile_cont","percentile_disc","period","portion","position","position_regex","power","precedes","precision","prepare","primary","procedure","ptf","range","rank","reads","real","recursive","ref","references","referencing","regr_avgx","regr_avgy","regr_count","regr_intercept","regr_r2","regr_slope","regr_sxx","regr_sxy","regr_syy","release","result","return","returns","revoke","right","rollback","rollup","row","row_number","rows","running","savepoint","scope","scroll","search","second","seek","select","sensitive","session_user","set","show","similar","sin","sinh","skip","smallint","some","specific","specifictype","sql","sqlexception","sqlstate","sqlwarning","sqrt","start","static","stddev_pop","stddev_samp","submultiset","subset","substring","substring_regex","succeeds","sum","symmetric","system","system_time","system_user","table","tablesample","tan","tanh","then","time","timestamp","timezone_hour","timezone_minute","to","trailing","translate","translate_regex","translation","treat","trigger","trim","trim_array","true","truncate","uescape","union","unique","unknown","unnest","update","upper","user","using","value","values","value_of","var_pop","var_samp","varbinary","varchar","varying","versioning","when","whenever","where","width_bucket","window","with","within","without","year","add","asc","collation","desc","final","first","last","view"].filter((e=>!r.includes(e))),c={
+ begin:n.concat(/\b/,n.either(...o),/\s*\(/),relevance:0,keywords:{built_in:o}}
+ ;return{name:"SQL",case_insensitive:!0,illegal:/[{}]|<\//,keywords:{
+ $pattern:/\b[\w\.]+/,keyword:((e,{exceptions:n,when:t}={})=>{const a=t
+ ;return n=n||[],e.map((e=>e.match(/\|\d+$/)||n.includes(e)?e:a(e)?e+"|0":e))
+ })(l,{when:e=>e.length<3}),literal:a,type:i,
+ built_in:["current_catalog","current_date","current_default_transform_group","current_path","current_role","current_schema","current_transform_group_for_type","current_user","session_user","system_time","system_user","current_time","localtime","current_timestamp","localtimestamp"]
+ },contains:[{begin:n.either(...s),relevance:0,keywords:{$pattern:/[\w\.]+/,
+ keyword:l.concat(s),literal:a,type:i}},{className:"type",
+ begin:n.either("double precision","large object","with timezone","without timezone")
+ },c,{className:"variable",begin:/@[a-z0-9][a-z0-9_]*/},{className:"string",
+ variants:[{begin:/'/,end:/'/,contains:[{begin:/''/}]}]},{begin:/"/,end:/"/,
+ contains:[{begin:/""/}]},e.C_NUMBER_MODE,e.C_BLOCK_COMMENT_MODE,t,{
+ className:"operator",begin:/[-+*/=%^~]|&&?|\|\|?|!=?|<(?:=>?|<|>)?|>[>=]?/,
+ relevance:0}]}},grmr_swift:e=>{const n={match:/\s+/,relevance:0
+ },t=e.COMMENT("/\\*","\\*/",{contains:["self"]}),a=[e.C_LINE_COMMENT_MODE,t],i={
+ match:[/\./,m(...xe,...Me)],className:{2:"keyword"}},r={match:b(/\./,m(...Ae)),
+ relevance:0},s=Ae.filter((e=>"string"==typeof e)).concat(["_|0"]),o={variants:[{
+ className:"keyword",
+ match:m(...Ae.filter((e=>"string"!=typeof e)).concat(Se).map(ke),...Me)}]},l={
+ $pattern:m(/\b\w+/,/#\w+/),keyword:s.concat(Re),literal:Ce},c=[i,r,o],g=[{
+ match:b(/\./,m(...De)),relevance:0},{className:"built_in",
+ match:b(/\b/,m(...De),/(?=\()/)}],u={match:/->/,relevance:0},p=[u,{
+ className:"operator",relevance:0,variants:[{match:Be},{match:`\\.(\\.|${Le})+`}]
+ }],_="([0-9]_*)+",h="([0-9a-fA-F]_*)+",f={className:"number",relevance:0,
+ variants:[{match:`\\b(${_})(\\.(${_}))?([eE][+-]?(${_}))?\\b`},{
+ match:`\\b0x(${h})(\\.(${h}))?([pP][+-]?(${_}))?\\b`},{match:/\b0o([0-7]_*)+\b/
+ },{match:/\b0b([01]_*)+\b/}]},E=(e="")=>({className:"subst",variants:[{
+ match:b(/\\/,e,/[0\\tnr"']/)},{match:b(/\\/,e,/u\{[0-9a-fA-F]{1,8}\}/)}]
+ }),y=(e="")=>({className:"subst",match:b(/\\/,e,/[\t ]*(?:[\r\n]|\r\n)/)
+ }),N=(e="")=>({className:"subst",label:"interpol",begin:b(/\\/,e,/\(/),end:/\)/
+ }),w=(e="")=>({begin:b(e,/"""/),end:b(/"""/,e),contains:[E(e),y(e),N(e)]
+ }),v=(e="")=>({begin:b(e,/"/),end:b(/"/,e),contains:[E(e),N(e)]}),O={
+ className:"string",
+ variants:[w(),w("#"),w("##"),w("###"),v(),v("#"),v("##"),v("###")]
+ },k=[e.BACKSLASH_ESCAPE,{begin:/\[/,end:/\]/,relevance:0,
+ contains:[e.BACKSLASH_ESCAPE]}],x={begin:/\/[^\s](?=[^/\n]*\/)/,end:/\//,
+ contains:k},M=e=>{const n=b(e,/\//),t=b(/\//,e);return{begin:n,end:t,
+ contains:[...k,{scope:"comment",begin:`#(?!.*${t})`,end:/$/}]}},S={
+ scope:"regexp",variants:[M("###"),M("##"),M("#"),x]},A={match:b(/`/,Fe,/`/)
+ },C=[A,{className:"variable",match:/\$\d+/},{className:"variable",
+ match:`\\$${ze}+`}],T=[{match:/(@|#(un)?)available/,scope:"keyword",starts:{
+ contains:[{begin:/\(/,end:/\)/,keywords:Pe,contains:[...p,f,O]}]}},{
+ scope:"keyword",match:b(/@/,m(...je))},{scope:"meta",match:b(/@/,Fe)}],R={
+ match:d(/\b[A-Z]/),relevance:0,contains:[{className:"type",
+ match:b(/(AV|CA|CF|CG|CI|CL|CM|CN|CT|MK|MP|MTK|MTL|NS|SCN|SK|UI|WK|XC)/,ze,"+")
+ },{className:"type",match:Ue,relevance:0},{match:/[?!]+/,relevance:0},{
+ match:/\.\.\./,relevance:0},{match:b(/\s+&\s+/,d(Ue)),relevance:0}]},D={
+ begin:/</,end:/>/,keywords:l,contains:[...a,...c,...T,u,R]};R.contains.push(D)
+ ;const I={begin:/\(/,end:/\)/,relevance:0,keywords:l,contains:["self",{
+ match:b(Fe,/\s*:/),keywords:"_|0",relevance:0
+ },...a,S,...c,...g,...p,f,O,...C,...T,R]},L={begin:/</,end:/>/,
+ keywords:"repeat each",contains:[...a,R]},B={begin:/\(/,end:/\)/,keywords:l,
+ contains:[{begin:m(d(b(Fe,/\s*:/)),d(b(Fe,/\s+/,Fe,/\s*:/))),end:/:/,
+ relevance:0,contains:[{className:"keyword",match:/\b_\b/},{className:"params",
+ match:Fe}]},...a,...c,...p,f,O,...T,R,I],endsParent:!0,illegal:/["']/},$={
+ match:[/(func|macro)/,/\s+/,m(A.match,Fe,Be)],className:{1:"keyword",
+ 3:"title.function"},contains:[L,B,n],illegal:[/\[/,/%/]},z={
+ match:[/\b(?:subscript|init[?!]?)/,/\s*(?=[<(])/],className:{1:"keyword"},
+ contains:[L,B,n],illegal:/\[|%/},F={match:[/operator/,/\s+/,Be],className:{
+ 1:"keyword",3:"title"}},U={begin:[/precedencegroup/,/\s+/,Ue],className:{
+ 1:"keyword",3:"title"},contains:[R],keywords:[...Te,...Ce],end:/}/}
+ ;for(const e of O.variants){const n=e.contains.find((e=>"interpol"===e.label))
+ ;n.keywords=l;const t=[...c,...g,...p,f,O,...C];n.contains=[...t,{begin:/\(/,
+ end:/\)/,contains:["self",...t]}]}return{name:"Swift",keywords:l,
+ contains:[...a,$,z,{beginKeywords:"struct protocol class extension enum actor",
+ end:"\\{",excludeEnd:!0,keywords:l,contains:[e.inherit(e.TITLE_MODE,{
+ className:"title.class",begin:/[A-Za-z$_][\u00C0-\u02B80-9A-Za-z$_]*/}),...c]
+ },F,U,{beginKeywords:"import",end:/$/,contains:[...a],relevance:0
+ },S,...c,...g,...p,f,O,...C,...T,R,I]}},grmr_typescript:e=>{
+ const n=Oe(e),t=_e,a=["any","void","number","boolean","string","object","never","symbol","bigint","unknown"],i={
+ beginKeywords:"namespace",end:/\{/,excludeEnd:!0,
+ contains:[n.exports.CLASS_REFERENCE]},r={beginKeywords:"interface",end:/\{/,
+ excludeEnd:!0,keywords:{keyword:"interface extends",built_in:a},
+ contains:[n.exports.CLASS_REFERENCE]},s={$pattern:_e,
+ keyword:he.concat(["type","namespace","interface","public","private","protected","implements","declare","abstract","readonly","enum","override"]),
+ literal:fe,built_in:ve.concat(a),"variable.language":we},o={className:"meta",
+ begin:"@"+t},l=(e,n,t)=>{const a=e.contains.findIndex((e=>e.label===n))
+ ;if(-1===a)throw Error("can not find mode to replace");e.contains.splice(a,1,t)}
+ ;return Object.assign(n.keywords,s),
+ n.exports.PARAMS_CONTAINS.push(o),n.contains=n.contains.concat([o,i,r]),
+ l(n,"shebang",e.SHEBANG()),l(n,"use_strict",{className:"meta",relevance:10,
+ begin:/^\s*['"]use strict['"]/
+ }),n.contains.find((e=>"func.def"===e.label)).relevance=0,Object.assign(n,{
+ name:"TypeScript",aliases:["ts","tsx","mts","cts"]}),n},grmr_vbnet:e=>{
+ const n=e.regex,t=/\d{1,2}\/\d{1,2}\/\d{4}/,a=/\d{4}-\d{1,2}-\d{1,2}/,i=/(\d|1[012])(:\d+){0,2} *(AM|PM)/,r=/\d{1,2}(:\d{1,2}){1,2}/,s={
+ className:"literal",variants:[{begin:n.concat(/# */,n.either(a,t),/ *#/)},{
+ begin:n.concat(/# */,r,/ *#/)},{begin:n.concat(/# */,i,/ *#/)},{
+ begin:n.concat(/# */,n.either(a,t),/ +/,n.either(i,r),/ *#/)}]
+ },o=e.COMMENT(/'''/,/$/,{contains:[{className:"doctag",begin:/<\/?/,end:/>/}]
+ }),l=e.COMMENT(null,/$/,{variants:[{begin:/'/},{begin:/([\t ]|^)REM(?=\s)/}]})
+ ;return{name:"Visual Basic .NET",aliases:["vb"],case_insensitive:!0,
+ classNameAliases:{label:"symbol"},keywords:{
+ keyword:"addhandler alias aggregate ansi as async assembly auto binary by byref byval call case catch class compare const continue custom declare default delegate dim distinct do each equals else elseif end enum erase error event exit explicit finally for friend from function get global goto group handles if implements imports in inherits interface into iterator join key let lib loop me mid module mustinherit mustoverride mybase myclass namespace narrowing new next notinheritable notoverridable of off on operator option optional order overloads overridable overrides paramarray partial preserve private property protected public raiseevent readonly redim removehandler resume return select set shadows shared skip static step stop structure strict sub synclock take text then throw to try unicode until using when where while widening with withevents writeonly yield",
+ built_in:"addressof and andalso await directcast gettype getxmlnamespace is isfalse isnot istrue like mod nameof new not or orelse trycast typeof xor cbool cbyte cchar cdate cdbl cdec cint clng cobj csbyte cshort csng cstr cuint culng cushort",
+ type:"boolean byte char date decimal double integer long object sbyte short single string uinteger ulong ushort",
+ literal:"true false nothing"},
+ illegal:"//|\\{|\\}|endif|gosub|variant|wend|^\\$ ",contains:[{
+ className:"string",begin:/"(""|[^/n])"C\b/},{className:"string",begin:/"/,
+ end:/"/,illegal:/\n/,contains:[{begin:/""/}]},s,{className:"number",relevance:0,
+ variants:[{begin:/\b\d[\d_]*((\.[\d_]+(E[+-]?[\d_]+)?)|(E[+-]?[\d_]+))[RFD@!#]?/
+ },{begin:/\b\d[\d_]*((U?[SIL])|[%&])?/},{begin:/&H[\dA-F_]+((U?[SIL])|[%&])?/},{
+ begin:/&O[0-7_]+((U?[SIL])|[%&])?/},{begin:/&B[01_]+((U?[SIL])|[%&])?/}]},{
+ className:"label",begin:/^\w+:/},o,l,{className:"meta",
+ begin:/[\t ]*#(const|disable|else|elseif|enable|end|externalsource|if|region)\b/,
+ end:/$/,keywords:{
+ keyword:"const disable else elseif enable end externalsource if region then"},
+ contains:[l]}]}},grmr_wasm:e=>{e.regex;const n=e.COMMENT(/\(;/,/;\)/)
+ ;return n.contains.push("self"),{name:"WebAssembly",keywords:{$pattern:/[\w.]+/,
+ keyword:["anyfunc","block","br","br_if","br_table","call","call_indirect","data","drop","elem","else","end","export","func","global.get","global.set","local.get","local.set","local.tee","get_global","get_local","global","if","import","local","loop","memory","memory.grow","memory.size","module","mut","nop","offset","param","result","return","select","set_global","set_local","start","table","tee_local","then","type","unreachable"]
+ },contains:[e.COMMENT(/;;/,/$/),n,{match:[/(?:offset|align)/,/\s*/,/=/],
+ className:{1:"keyword",3:"operator"}},{className:"variable",begin:/\$[\w_]+/},{
+ match:/(\((?!;)|\))+/,className:"punctuation",relevance:0},{
+ begin:[/(?:func|call|call_indirect)/,/\s+/,/\$[^\s)]+/],className:{1:"keyword",
+ 3:"title.function"}},e.QUOTE_STRING_MODE,{match:/(i32|i64|f32|f64)(?!\.)/,
+ className:"type"},{className:"keyword",
+ match:/\b(f32|f64|i32|i64)(?:\.(?:abs|add|and|ceil|clz|const|convert_[su]\/i(?:32|64)|copysign|ctz|demote\/f64|div(?:_[su])?|eqz?|extend_[su]\/i32|floor|ge(?:_[su])?|gt(?:_[su])?|le(?:_[su])?|load(?:(?:8|16|32)_[su])?|lt(?:_[su])?|max|min|mul|nearest|neg?|or|popcnt|promote\/f32|reinterpret\/[fi](?:32|64)|rem_[su]|rot[lr]|shl|shr_[su]|store(?:8|16|32)?|sqrt|sub|trunc(?:_[su]\/f(?:32|64))?|wrap\/i64|xor))\b/
+ },{className:"number",relevance:0,
+ match:/[+-]?\b(?:\d(?:_?\d)*(?:\.\d(?:_?\d)*)?(?:[eE][+-]?\d(?:_?\d)*)?|0x[\da-fA-F](?:_?[\da-fA-F])*(?:\.[\da-fA-F](?:_?[\da-fA-D])*)?(?:[pP][+-]?\d(?:_?\d)*)?)\b|\binf\b|\bnan(?::0x[\da-fA-F](?:_?[\da-fA-D])*)?\b/
+ }]}},grmr_xml:e=>{
+ const n=e.regex,t=n.concat(/[\p{L}_]/u,n.optional(/[\p{L}0-9_.-]*:/u),/[\p{L}0-9_.-]*/u),a={
+ className:"symbol",begin:/&[a-z]+;|[0-9]+;|[a-f0-9]+;/},i={begin:/\s/,
+ contains:[{className:"keyword",begin:/#?[a-z_][a-z1-9_-]+/,illegal:/\n/}]
+ },r=e.inherit(i,{begin:/\(/,end:/\)/}),s=e.inherit(e.APOS_STRING_MODE,{
+ className:"string"}),o=e.inherit(e.QUOTE_STRING_MODE,{className:"string"}),l={
+ endsWithParent:!0,illegal:/</,relevance:0,contains:[{className:"attr",
+ begin:/[\p{L}0-9._:-]+/u,relevance:0},{begin:/=\s*/,relevance:0,contains:[{
+ className:"string",endsParent:!0,variants:[{begin:/"/,end:/"/,contains:[a]},{
+ begin:/'/,end:/'/,contains:[a]},{begin:/[^\s"'=<>`]+/}]}]}]};return{
+ name:"HTML, XML",
+ aliases:["html","xhtml","rss","atom","xjb","xsd","xsl","plist","wsf","svg"],
+ case_insensitive:!0,unicodeRegex:!0,contains:[{className:"meta",begin:/<![a-z]/,end:/>/,relevance:10,contains:[i,o,s,r,{begin:/\[/,end:/\]/,contains:[{
+ className:"meta",begin:/<![a-z]/,end:/>/,contains:[i,r,o,s]}]}]
+ },e.COMMENT(/<!--/,/-->/,{relevance:10}),{begin:/<!\[CDATA\[/,end:/\]\]>/,
+ relevance:10},a,{className:"meta",end:/\?>/,variants:[{begin:/<\?xml/,
+ relevance:10,contains:[o]},{begin:/<\?[a-z][a-z0-9]+/}]},{className:"tag",
+ begin:/
+
+ Apple
+
From Wikipedia, the free encyclopedia
+
Fruit that grows on a tree
+
An apple is a round, edible fruit produced by an apple tree (Malus spp., among them the domestic or orchard apple; Malus domestica). Apple trees are cultivated worldwide and are the most widely grown species in the genus Malus. The tree originated in Central Asia, where its wild ancestor, Malus sieversii, is still found. Apples have been grown for thousands of years in Eurasia and were introduced to North America by European colonists. Apples have religious and mythological significance in many cultures, including Norse, Greek, and European Christian tradition.
+
Apples grown from seed tend to be very different from those of their parents, and the resultant fruit frequently lacks desired characteristics. For commercial purposes, including botanical evaluation, apple cultivars are propagated by clonal grafting onto rootstocks. Apple trees grown without rootstocks tend to be larger and much slower to fruit after planting. Rootstocks are used to control the speed of growth and the size of the resulting tree, allowing for easier harvesting.
+
There are more than 7,500 cultivars of apples. Different cultivars are bred for various tastes and uses, including cooking, eating raw, and cider or apple juice production. Trees and fruit are prone to fungal, bacterial, and pest problems, which can be controlled by a number of organic and non-organic means. In 2010, the fruit's genome was sequenced as part of research on disease control and selective breeding in apple production.
+
+
+
Etymology
+
The word apple, whose Old English ancestor is æppel, is descended from the Proto-Germanic noun *aplaz, descended in turn from Proto-Indo-European *h₂ébōl.[3] As late as the 17th century, the word also functioned as a generic term for all fruit, including nuts. This can be compared to the 14th-century Middle English expression appel of paradis, meaning a banana.[4]
+
+
Description
+
The apple is a deciduous tree, generally standing 2 to 4.5 metres (6 to 15 feet) tall in cultivation and up to 15 m (49 ft) in the wild, though more typically 2 to 10 m (6.5 to 33 ft).[5][1] When cultivated, the size, shape and branch density are determined by rootstock selection and trimming method.[5] Apple trees may naturally have a rounded to erect crown with a dense canopy of leaves.[6] The bark of the trunk is dark gray or gray-brown, but young branches are reddish or dark-brown with a smooth texture.[1][7] When young, twigs are covered in very fine downy hairs, becoming hairless as they grow older.[7]
+
The buds are egg-shaped and dark red or purple in color; they range in size from 3 to 5 millimeters, but are usually less than 4 mm. The bud scales have very hairy edges. When emerging from the buds, the leaves are convolute, meaning that their edges overlap each other.[1] Leaves can be simple ovals (elliptic), medium or wide in width, somewhat egg-shaped with the wider portion toward their base (ovate), or even with sides that are more parallel to each other instead of curved (oblong) with a narrow pointed end.[7][1] The edges have broadly-angled teeth, but do not have lobes. The top surfaces of the leaves are glabrescent, almost hairless, while the undersides are densely covered in fine hairs.[1] The leaves are attached alternately by short leaf stems 1-to-3.5 cm (1⁄2-to-1+1⁄2 in) long.[6][1]
+
Blossoms are produced in spring simultaneously with the budding of the leaves, on spurs and some long shoots.[5] When the flower buds first begin to open, the petals are rose-pink, fading to white or light pink when fully open, with each flower 3 to 4 centimeters (1 to 1+1⁄2 inches) in diameter.[1] The five-petaled flowers are grouped in an inflorescence consisting of a cyme with 3–7 flowers.[8] The central flower of the inflorescence is called the "king bloom"; it opens first and can develop a larger fruit.[6] Open apple blossoms are damaged by even brief exposure to temperatures of −2 °C (28 °F) or less, although the overwintering wood and buds are hardy down to −40 °C (−40 °F).[8]
+
+ Apple blossoms
+
+ Botanical illustration
+
Fruit
+
The fruit is a pome that matures in late summer or autumn.[1] The true fruits, or carpels, are the harder interior chambers inside the apple's core. There are usually five carpels inside an apple, but there may be as few as three. Each of the chambers contains one or two seeds.[9] The edible flesh is formed from the receptacle at the base of the flower.[10]
+
+
+
The seeds are egg- to pear-shaped and may be colored from light brown or tan to a very dark brown, often with red shades or even purplish-black. They may have a blunt or sharp point.[11] The five sepals remain attached and stand out from the surface of the apple.[1]
+
The size of the fruit varies widely between cultivars, but the diameter is generally between 2.5 and 12 cm (1 and 5 in).[7] The shape is quite variable and may be nearly round, elongated, conical, or short and wide.[12]
+
The groundcolor of ripe apples is yellow, green, yellow-green or whitish yellow. The overcolor of ripe apples can be orange-red, pink-red, red, purple-red or brown-red. The overcolor amount can be 0–100%.[13] The skin may be wholly or partly russeted, making it rough and brown. The skin is covered in a protective layer of epicuticular wax.[14] The skin may also be marked with scattered dots.[1] The flesh is generally pale yellowish-white, though it can be pink, yellow or green.[13]
+
+ Apples can have any amount of overcolor, a darker tint over a pale groundcolor; examples range from 0% overcolor to 100% overcolor.
+
+
Chemistry
+
Important volatile compounds in apples that contribute to their scent and flavour include acetaldehyde, ethyl acetate, 1-butanal, ethanol, 2-methylbutanal, 3-methylbutanal, ethyl propionate, ethyl 2-methylpropionate, ethyl butyrate, ethyl 2-methyl butyrate, hexanal, 1-butanol, 3-methylbutyl acetate, 2-methylbutyl acetate, 1-propyl butyrate, ethyl pentanoate, amyl acetate, 2-methyl-1-butanol, trans-2-hexenal, ethyl hexanoate, and hexanol.[15][16]
+
+
Taxonomy
+
The apple as a species has more than 100 alternative scientific names, or synonyms .[ 17] In modern times, Malus pumila and Malus domestica are the two main names in use. M. pumila is the older name, but M. domestica has become much more commonly used starting in the 21st century, especially in the western world. Two proposals were made to make M. domestica a conserved name : the earlier proposal was voted down by the Committee for Vascular Plants of the IAPT in 2014, but in April 2017 the Committee decided, with a narrow majority, that the newly popular name should be conserved.[ 18] The General Committee of the IAPT decided in June 2017 to approve this change, officially conserving M. domestica .[ 19] Nevertheless, some works published after 2017 still use M. pumila as the correct name , under an alternate taxonomy.[ 2]
+
When Linnaeus first classified them in 1753, he combined the pears, apples, and quinces into a single genus, which he named Pyrus, and named the apple Pyrus malus. This was widely accepted; however, in 1754 the botanist Philip Miller published an alternate classification in The Gardeners Dictionary that separated the apple species from Pyrus. He did not clearly indicate that by Malus pumila he meant the domesticated apple, but it was nonetheless used as such by many botanists. When Moritz Balthasar Borkhausen published his scientific description of the apple in 1803, the name may have been intended as a new combination based on P. malus var. domestica, though Borkhausen did not directly reference it.[ 17] The earliest use of var. domestica for the apple was by Georg Adolf Suckow in 1786.[ 2]
+
+
Genome
+
+
Apples are diploid, with two sets of 17 chromosomes per cell (though triploid cultivars, with three sets, are not uncommon), and have an estimated genome size of approximately 650 Mb. Several whole genome sequences have been completed and made available; the first, in 2010, was based on the diploid cultivar 'Golden Delicious '.[ 20] However, this first whole genome sequence contained several errors,[ 21] in part owing to the high degree of heterozygosity in diploid apples which, in combination with an ancient genome duplication, complicated the assembly. More recently, double- and trihaploid individuals have been sequenced, yielding whole genome sequences of higher quality.[ 22] [ 23]
+
The first whole genome assembly was estimated to contain around 57,000 genes,[ 20] though the more recent genome sequences support estimates between 42,000 and 44,700 protein-coding genes.[ 22] [ 23] The availability of whole genome sequences has provided evidence that the wild ancestor of the cultivated apple most likely is Malus sieversii . Re-sequencing of multiple accessions has supported this, while also suggesting extensive introgression from Malus sylvestris following domestication.[ 24]
+
+
Cultivation
+
History
+
Map of the origins of the cultivated apple. The wild origin is in Kazakhstan; hybridisations and repeated domestications followed, modifying many attributes of the fruit.[ 24]
+
Wild Malus sieversii apple in Kazakhstan
+
Central Asia is generally considered the center of origin for apples due to the genetic variability in specimens there.[ 25] The wild ancestor of Malus domestica was Malus sieversii , found growing wild in the mountains of Central Asia in southern Kazakhstan , Kyrgyzstan , Tajikistan , and northwestern China .[ 5] [ 26] Cultivation of the species, most likely beginning on the forested flanks of the Tian Shan mountains, progressed over a long period of time and permitted secondary introgression of genes from other species into the open-pollinated seeds. Significant exchange with Malus sylvestris , the crabapple, resulted in populations of apples being more related to crabapples than to the more morphologically similar progenitor Malus sieversii . In strains without recent admixture the contribution of the latter predominates.[ 27] [ 28] [ 29]
+
The apple is thought to have been domesticated 4,000–10,000 years ago in the Tian Shan mountains, and then to have travelled along the Silk Road to Europe, with hybridization and introgression of wild crabapples from Siberia (M. baccata ), the Caucasus (M. orientalis ), and Europe (M. sylvestris ). Only the M. sieversii trees growing on the western side of the Tian Shan mountains contributed genetically to the domesticated apple, not the isolated population on the eastern side.[ 24]
+
Chinese soft apples, such as M. asiatica and M. prunifolia , have been cultivated as dessert apples for more than 2,000 years in China. These are thought to be hybrids between M. baccata and M. sieversii in Kazakhstan.[ 24]
+
Among the traits selected for by human growers are size, fruit acidity, color, firmness, and soluble sugar. Unusually for a domesticated fruit, the wild progenitor M. sieversii is only slightly smaller than the modern domesticated apple.[ 24]
+
At the Sammardenchia-Cueis site near Udine in northeastern Italy, seeds from some form of apple have been found in material carbon-dated to between 6570 and 5684 BCE.[ 30] Genetic analysis has not yet been successfully used to determine whether such ancient apples were wild Malus sylvestris or Malus domestica containing Malus sieversii ancestry. It is hard to distinguish in the archeological record between foraged wild apples and apple plantations.[ 31]
+
There is indirect evidence of apple cultivation in the third millennium BCE in the Middle East.[ 31] There is direct evidence in the form of apple cores, dated to the 10th century BCE, from a Judean site between the Sinai and the Negev.[ 32]
+There was substantial apple production in European classical antiquity, and grafting was certainly known then.[ 31] Grafting is an essential part of modern domesticated apple production, as it allows the best cultivars to be propagated; it is unclear when apple tree grafting was invented.[ 31]
+
The Roman writer Pliny the Elder, writing in the 1st century CE, describes a method of storing apples: they should be placed in a room with good air circulation from a north-facing window, on a bed of straw, chaff, or mats, with windfalls kept separately.[ 33] Though methods like this extend the availability of reasonably fresh apples, without refrigeration their lifespan is limited; even sturdy winter apple varieties will only keep well until December in cool climates.[ 34] For longer storage, medieval Europeans strung up cored and peeled apples to dry, either whole or sliced into rings.[ 35]
+
Of the many Old World plants that the Spanish introduced to Chiloé Archipelago in the 16th century, apple trees became particularly well adapted.[ 36] Apples were introduced to North America by colonists in the 17th century,[ 5] and the first named apple cultivar was introduced in Boston by Reverend William Blaxton in 1640.[ 37] The only apples native to North America are crab apples .[ 38]
+
Apple cultivars brought as seed from Europe were spread along Native American trade routes, as well as being cultivated on colonial farms. An 1845 United States apple nursery catalogue offered 350 of the "best" cultivars, showing the proliferation of new North American cultivars by the early 19th century.[ 38] In the 20th century, irrigation projects in Eastern Washington allowed the development of a multibillion-dollar fruit industry, of which the apple is the leading product.[ 5]
+
Until the 20th century, farmers stored apples in frostproof cellars during the winter for their own use or for sale. Improved transportation of fresh apples by train and road replaced the necessity for storage.[ 39] [ 40] Controlled atmosphere facilities, which use high humidity, low oxygen, and controlled carbon dioxide levels to maintain fruit freshness, are now used to keep apples fresh year-round; they were first researched at Cambridge University in the 1920s and first used in the United States in the 1950s.[ 41]
+
+
Breeding
+
+
An apple tree in Germany
+
Many apples grow readily from seeds. However, apples must be propagated asexually to obtain trees with the characteristics of the parent, because seedling apples are "extreme heterozygotes ": rather than resembling their parents, seedlings are all different from each other and from their parents.[ 42] Triploid cultivars have an additional reproductive barrier in that their three sets of chromosomes cannot be divided evenly during meiosis, yielding unequal segregation of the chromosomes (aneuploids). Even when a triploid plant can produce seed (as apples can), this occurs infrequently, and the seedlings rarely survive.[ 43]
+
Because apples do not breed true when planted as seeds, propagation usually involves grafting cuttings. The rootstock used for the bottom of the graft can be selected to produce trees of a wide range of sizes, as well as to change the winter hardiness, insect and disease resistance, and soil preference of the resulting tree. Dwarf rootstocks can be used to produce very small trees (less than 3.0 m or 10 ft high at maturity), which bear fruit many years earlier in their life cycle than full-size trees and are easier to harvest.[ 44]
+
Dwarf rootstocks for apple trees can be traced as far back as 300 BCE, to the area of Persia and Asia Minor . Alexander the Great sent samples of dwarf apple trees to Aristotle 's Lyceum . Dwarf rootstocks became common by the 15th century and later went through several cycles of popularity and decline throughout the world.[ 45] The majority of the rootstocks used to control size in apples were developed in England in the early 1900s. The East Malling Research Station conducted extensive research into rootstocks, and their rootstocks are given an "M" prefix to designate their origin. Rootstocks marked with an "MM" prefix are Malling-series cultivars later crossed with trees of 'Northern Spy ' in Merton, England .[ 46]
+
Most new apple cultivars originate as seedlings, which either arise by chance or are bred by deliberately crossing cultivars with promising characteristics.[ 47] The words "seedling", "pippin", and "kernel" in the name of an apple cultivar suggest that it originated as a seedling. Apples can also form bud sports (mutations on a single branch). Some bud sports turn out to be improved strains of the parent cultivar. Some differ sufficiently from the parent tree to be considered new cultivars.[ 48]
+
Apples have been acclimatized in Ecuador at very high altitudes, where, given the right conditions, they can often provide two crops per year because of constant temperate conditions year-round.[ 49]
+
+
Pollination
+
+
Apple blossom from an old Ayrshire cultivar
+
An orchard mason bee on an apple bloom in British Columbia , Canada
+
Apples are self-incompatible; they must cross-pollinate to develop fruit. During flowering each season, apple growers often bring in pollinators to carry the pollen. Honey bees are most commonly used; orchard mason bees are also used as supplemental pollinators in commercial orchards. Bumblebee queens are sometimes present in orchards, but not usually in sufficient numbers to be significant pollinators.[ 48] [ 50]
+
Cultivars are sometimes classified by the day of peak bloom in the average 30-day blossom period, with pollinizers selected from cultivars within a 6-day overlap period. There are four to seven pollination groups in apples, depending on climate:
+
+
Group A – Early flowering, 1 to 3 May in England ('Gravenstein ', 'Red Astrachan')
+Group B – 4 to 7 May ('Idared ', 'McIntosh ')
+Group C – Mid-season flowering, 8 to 11 May ('Granny Smith ', 'Cox's Orange Pippin ')
+Group D – Mid/late season flowering, 12 to 15 May ('Golden Delicious ', 'Calville blanc d'hiver')
+Group E – Late flowering, 16 to 18 May ('Braeburn ', 'Reinette d'Orléans')
+Group F – 19 to 23 May ('Suntan')
+Group H – 24 to 28 May ('Court-Pendu Gris' – also called Court-Pendu plat)
+
One cultivar can be pollinated by a compatible cultivar from the same group or close (A with A, or A with B, but not A with C or D).[ 51]
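To make the compatibility rule concrete, the following minimal Python sketch (an illustration only, not part of the source; the cultivar-to-group assignments are examples taken from the list above) treats cultivars in the same or adjacent bloom groups as compatible:

# Bloom groups in the order listed above; note the source list skips "G".
GROUP_ORDER = "ABCDEFH"

def can_cross_pollinate(group_a: str, group_b: str) -> bool:
    """Same or adjacent bloom groups overlap enough to cross-pollinate."""
    return abs(GROUP_ORDER.index(group_a) - GROUP_ORDER.index(group_b)) <= 1

# Example cultivar-to-group assignments drawn from the list above.
bloom_group = {"Gravenstein": "A", "McIntosh": "B", "Granny Smith": "C"}

print(can_cross_pollinate(bloom_group["Gravenstein"], bloom_group["McIntosh"]))      # True: A with B
print(can_cross_pollinate(bloom_group["Gravenstein"], bloom_group["Granny Smith"]))  # False: A with C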
+
+
Maturation and harvest
+
+
L. K. Relander , the former President of Finland , with his family picking apples in the 1930s
+
Cultivars vary in their yield and the ultimate size of the tree, even when grown on the same rootstock. Some cultivars, if left unpruned, grow very large, which lets them bear more fruit but makes harvesting more difficult. Depending on tree density (number of trees planted per unit surface area), mature trees typically bear 40–200 kg (90–440 lb) of apples each year, though productivity can be close to zero in poor years. Apples are harvested using three-point ladders that are designed to fit amongst the branches. Trees grafted on dwarfing rootstocks bear about 10–80 kg (20–180 lb) of fruit per year.[ 48]
+
Some farms with apple orchards open them to the public so consumers can pick their own apples.[ 52]
+
Crops ripen at different times of the year according to the cultivar. Cultivars that yield their crop in the summer include 'Sweet Bough ' and 'Duchess'; fall producers include 'Blenheim'; winter producers include 'King', 'Swayzie ', and 'Tolman Sweet'.[ 38]
+
+
Storage
+
Different apple cultivars in a wholesale food market
+
Commercially, apples can be stored for months in controlled atmosphere chambers. Apples are commonly stored in chambers with lowered concentrations of oxygen to reduce respiration and slow softening and other changes if the fruit is already fully ripe. The gas ethylene is used by plants as a hormone which promotes ripening, decreasing the time an apple can be stored. For storage longer than about six months the apples are picked earlier, before full ripeness, when ethylene production by the fruit is low. However, in many varieties this increases their sensitivity to carbon dioxide , which also must be controlled.[ 53]
+
For home storage, most cultivars of apple can be kept for three weeks in a pantry, or four to six weeks from the date of purchase in a refrigerator maintained at 0 to 4 °C (32 to 39 °F).[ 54] [ 55] Some varieties of apples (e.g. 'Granny Smith ' and 'Fuji ') have more than three times the storage life of others.[ 56]
+
Non-organic apples may be sprayed with the substance 1-methylcyclopropene, which blocks the apples' ethylene receptors, temporarily preventing them from ripening.[ 57]
+
+
Pests and diseases
+
+
Codling moth larva tunnelling inside an apple
+
Apple trees are susceptible to fungal and bacterial diseases, and to damage by insect pests. Many commercial orchards pursue a program of chemical sprays to maintain high fruit quality, tree health, and high yields. Organic orchards, by contrast, prohibit the use of synthetic pesticides, though some older pesticides are allowed. Organic methods include, for instance, introducing a pest's natural predators to reduce its population.
+
A wide range of pests and diseases can affect the plant. Three of the more common diseases or pests are mildew, aphids, and apple scab.
+
+
Mildew is characterized by light grey powdery patches appearing on the leaves, shoots and flowers, normally in spring. The flowers turn a creamy yellow color and do not develop correctly. This can be treated similarly to Botrytis —eliminating the conditions that caused the disease and burning the infected plants are among recommended actions.[ 58]
+Aphids are small insects with sucking mouthparts . Five species of aphids commonly attack apples: apple grain aphid, rosy apple aphid, apple aphid, spirea aphid, and the woolly apple aphid. The aphid species can be identified by color, time of year, and by differences in the cornicles (small paired projections from their rear).[ 59] Aphids feed on foliage using needle-like mouth parts to suck out plant juices. When present in high numbers, certain species reduce tree growth and vigor.[ 60]
+Apple scab causes leaves to develop olive-brown spots with a velvety texture that later turn brown and cork-like. The disease also affects the fruit, which develops similar brown spots with velvety or cork-like textures. Apple scab is spread by a fungus that grows in old apple leaves on the ground and infects the new year's growth during warm spring weather.[ 61]
+
Among the most serious disease problems are fireblight , a bacterial disease, and three fungal diseases: Gymnosporangium rust, black spot ,[ 62] and bitter rot .[ 63] Codling moths and apple maggots (the larvae of a fruit fly) cause serious damage to apple fruits, making them unsaleable. Young apple trees are also prone to mammal pests like mice and deer, which feed on the soft bark of the trees, especially in winter.[ 61] The larvae of the apple clearwing moth (red-belted clearwing) burrow through the bark and into the phloem of apple trees, potentially causing significant damage.[ 64]
+
+
Cultivars
+
+
An assortment of apple cultivars
+
There are more than 7,500 known cultivars (cultivated varieties) of apples.[ 65] Cultivars vary in their yield and the ultimate size of the tree, even when grown on the same rootstock .[ 66] Different cultivars are available for temperate and subtropical climates. The UK's National Fruit Collection, which is the responsibility of the Department for Environment, Food and Rural Affairs, includes a collection of over 2,000 cultivars of apple tree in Kent.[ 67] The University of Reading , which is responsible for developing the UK national collection database, provides access to search the national collection. The University of Reading's work is part of the European Cooperative Programme for Plant Genetic Resources, in which 38 countries participate in the Malus/Pyrus working group.[ 68]
+
The UK's national fruit collection database contains much information on the characteristics and origin of many apples, including alternative names for what is essentially the same "genetic" apple cultivar. Most of these cultivars are bred for eating fresh (dessert apples), though some are cultivated specifically for cooking (cooking apples ) or producing cider . Cider apples are typically too tart and astringent to eat fresh, but they give the beverage a rich flavor that dessert apples cannot.[ 69]
+
In the United States there are many apple breeding programs associated with universities. Cornell University has had a program operating since 1880 in Geneva, New York ; among its recent well-known apples is the 'SnapDragon' cultivar, released in 2013. In the west, Washington State University started a program to support the state's apple industry in 1994 and released the 'Cosmic Crisp ' cultivar in 2017. The third most grown apple cultivar in the United States is the 'Honeycrisp ', released by the University of Minnesota program in 1991.[ 70] Unusually for a popular cultivar, the 'Honeycrisp' is not directly related to another popular apple cultivar but instead to two unsuccessful ones.[ 71] In Europe there are also many breeding programs, such as that of the Julius Kühn-Institut , the German federal research center for cultivated plants.[ 72]
+
Commercially popular apple cultivars are soft but crisp. Other desirable qualities in modern commercial apple breeding are a colorful skin, absence of russeting , ease of shipping, lengthy storage ability, high yields, disease resistance, common apple shape, and developed flavor.[ 66] Modern apples are generally sweeter than older cultivars, as popular tastes in apples have varied over time. Most North Americans and Europeans favor sweet, subacid apples, but tart apples have a strong minority following.[ 73] Extremely sweet apples with barely any acid flavor are popular in Asia,[ 73] especially the Indian subcontinent .[ 69]
+
+
Less common apple cultivars from an orchard in Italy
+
Old cultivars are often oddly shaped, russeted, and grow in a variety of textures and colors. Some find them to have better flavor than modern cultivars, but they may have other problems that make them commercially unviable—low yield, disease susceptibility, poor tolerance for storage or transport, or just being the "wrong" size.[ 74] A few old cultivars are still produced on a large scale, but many have been preserved by home gardeners and farmers that sell directly to local markets. Many unusual and locally important cultivars with their own unique taste and appearance exist; apple conservation campaigns have sprung up around the world to preserve such local cultivars from extinction. In the United Kingdom, old cultivars such as 'Cox's Orange Pippin ' and 'Egremont Russet ' are still commercially important even though by modern standards they are low yielding and susceptible to disease.[ 5]
+
+
Production
+
+
+
World production of apples in 2022 was 96 million tonnes , with China producing 50% of the total (table).[ 75] Secondary producers were the United States, Turkey , and Poland .[ 75]
+
+
Toxicity
+
Amygdalin
+
Apple seeds contain small amounts of amygdalin , a sugar and cyanide compound known as a cyanogenic glycoside . Ingesting small amounts of apple seeds causes no ill effects, but consumption of extremely large doses can cause adverse reactions . It may take several hours before the poison takes effect, as cyanogenic glycosides must be hydrolyzed before the cyanide ion is released.[ 76] The U.S. National Library of Medicine 's Hazardous Substances Data Bank records no cases of amygdalin poisoning from consuming apple seeds.[ 77]
+
+
Allergy
+
One form of apple allergy, often found in northern Europe, is called birch-apple syndrome and is found in people who are also allergic to birch pollen .[ 78] Allergic reactions are triggered by a protein in apples that is similar to birch pollen, and people affected by this protein can also develop allergies to other fruits, nuts, and vegetables. Reactions, which entail oral allergy syndrome (OAS), generally involve itching and inflammation of the mouth and throat,[ 78] but in rare cases can also include life-threatening anaphylaxis .[ 79] This reaction only occurs when raw fruit is consumed—the allergen is neutralized in the cooking process. The variety of apple, maturity and storage conditions can change the amount of allergen present in individual fruits. Long storage times can increase the amount of proteins that cause birch-apple syndrome.[ 78]
+
In other areas, such as the Mediterranean, some individuals have adverse reactions to apples because of their similarity to peaches.[ 78] This form of apple allergy also includes OAS, but often has more severe symptoms, such as vomiting, abdominal pain and urticaria , and can be life-threatening. Individuals with this form of allergy can also develop reactions to other fruits and nuts. Cooking does not break down the protein causing this particular reaction, so affected individuals cannot eat raw or cooked apples. Freshly harvested, over-ripe fruits tend to have the highest levels of the protein that causes this reaction.[ 78]
+
Breeding efforts have yet to produce a hypoallergenic fruit suitable for either of the two forms of apple allergy.[ 78]
+
+
Uses
+
+
Nutrition
+
+
A raw apple is 86% water and 14% carbohydrates , with negligible content of fat and protein (table). A reference serving of a raw apple with skin weighing 100 g (3.5 oz) provides 52 calories and a moderate content of dietary fiber (table). Otherwise, there is low content of micronutrients , with the Daily Values of all falling below 10% (table).
+
+
Culinary
+
+
Machine for paring, coring, and slicing apples, from Henry B. Scammell's 1897 handbook Cyclopedia of Valuable Receipts
+
Apple varieties can be grouped as cooking apples , eating apples , and cider apples , the last so astringent as to be "almost inedible".[ 82] Apples are consumed as juice , raw in salads, baked in pies , cooked into sauces and apple butter , or baked on their own.[ 83] They are sometimes used as an ingredient in savory foods, such as sausage and stuffing.[ 84]
+
Several techniques are used to preserve apples and apple products. Traditional methods include drying and making apple butter .[ 82] Juice and cider are produced commercially; cider is a significant industry in regions such as the West of England and Normandy .[ 82]
+
A toffee apple (UK) or caramel apple (US) is a confection made by coating an apple in hot toffee or caramel candy respectively and allowing it to cool.[ 85] [ 8] Apples and honey are a ritual food pairing eaten during the Jewish New Year of Rosh Hashanah .[ 86]
+
Apples are an important ingredient in many desserts, such as pies , crumbles , and cakes . When cooked, some apple cultivars easily form a puree known as apple sauce , which can be cooked down to form a preserve, apple butter. They are often baked or stewed , and are cooked in some meat dishes.[ 82]
+
Apples are milled or pressed to produce apple juice , which may be drunk unfiltered (called apple cider in North America), or filtered. Filtered juice is often concentrated and frozen, then reconstituted later and consumed. Apple juice can be fermented to make cider (called hard cider in North America), ciderkin , and vinegar.[ 8] Through distillation , various alcoholic beverages can be produced, such as applejack , Calvados , and apple brandy .[ 8] [ 87]
+
+
Organic production
+
Organic apples are commonly produced in the United States.[ 88] Due to infestations by key insects and diseases, organic production is difficult in Europe.[ 89] The use of pesticides based on sulfur, copper, microorganisms, viruses, clay powders, or plant extracts (pyrethrum , neem ) has been approved by the EU Organic Standing Committee to improve organic yield and quality.[ 89] A light coating of kaolin , which forms a physical barrier to some pests, may also help prevent apple sun scalding.[ 48]
+
+
Non-browning apples
+
Apple skins and seeds contain polyphenols .[ 90] These are oxidised by the enzyme polyphenol oxidase , which causes browning in sliced or bruised apples, by catalyzing the oxidation of phenolic compounds to o-quinones , a browning factor.[ 91] Browning reduces apple taste, color, and food value. Arctic apples , a non-browning group of apples introduced to the United States market in 2019, have been genetically modified to silence the expression of polyphenol oxidase, thereby delaying a browning effect and improving apple eating quality.[ 92] [ 93] The US Food and Drug Administration in 2015, and Canadian Food Inspection Agency in 2017, determined that Arctic apples are as safe and nutritious as conventional apples.[ 94] [ 95]
+
+
Other products
+
Apple seed oil, obtained by pressing apple seeds, is used in manufacturing cosmetics .[ 96]
+
+
In culture
+
+
Germanic paganism
+
"Brita as Iduna " (1901) by Carl Larsson
+
In Norse mythology , the goddess Iðunn is portrayed in the Prose Edda (written in the 13th century by Snorri Sturluson ) as providing apples to the gods that give them eternal youthfulness . The English scholar H. R. Ellis Davidson links apples to religious practices in Germanic paganism , from which Norse paganism developed. She points out that buckets of apples were found in the Oseberg ship burial site in Norway, that fruit and nuts (Iðunn having been described as being transformed into a nut in Skáldskaparmál ) have been found in the early graves of the Germanic peoples in England and elsewhere on the continent of Europe, which may have had a symbolic meaning, and that nuts are still a recognized symbol of fertility in southwest England.[ 97]
+
Davidson notes a connection between apples and the Vanir , a tribe of gods associated with fertility in Norse mythology, citing an instance of eleven "golden apples" being given to woo the beautiful Gerðr by Skírnir , who was acting as messenger for the major Vanir god Freyr in stanzas 19 and 20 of Skírnismál . Davidson also notes a further connection between fertility and apples in Norse mythology in chapter 2 of the Völsunga saga : when the major goddess Frigg sends King Rerir an apple after he prays to Odin for a child, Frigg's messenger (in the guise of a crow) drops the apple in his lap as he sits atop a mound .[ 97] Rerir's wife's consumption of the apple results in a six-year pregnancy and the birth (by Caesarean section ) of their son—the hero Völsung .[ 98]
+
Further, Davidson points out the "strange" phrase "Apples of Hel " used in an 11th-century poem by the skald Thorbiorn Brúnarson. She states this may imply that the apple was thought of by Brúnarson as the food of the dead. Further, Davidson notes that the potentially Germanic goddess Nehalennia is sometimes depicted with apples and that parallels exist in early Irish stories. Davidson asserts that while cultivation of the apple in Northern Europe extends back to at least the time of the Roman Empire and came to Europe from the Near East , the native varieties of apple trees growing in Northern Europe are small and bitter. Davidson concludes that in the figure of Iðunn "we must have a dim reflection of an old symbol: that of the guardian goddess of the life-giving fruit of the other world."[ 97]
+
+
Greek mythology
+
Heracles with the apple of Hesperides
+
Apples appear in many religious traditions , including Greek and Roman mythology, where the fruit carries an ambiguous symbolism of discord, fertility, or courtship.[ 99] In Greek mythology , the hero Heracles , as a part of his Twelve Labours , was required to travel to the Garden of the Hesperides and pick the golden apples off the Tree of Life growing at its center.[ 100]
+
The Greek goddess of discord, Eris , became disgruntled after she was excluded from the wedding of Peleus and Thetis .[ 101] In retaliation, she tossed a golden apple inscribed Καλλίστη (Kallistē , "For the most beautiful one"), into the wedding party. Three goddesses claimed the apple: Hera , Athena , and Aphrodite . Paris of Troy was appointed to select the recipient. After being bribed by both Hera and Athena, Aphrodite tempted him with the most beautiful woman in the world, Helen of Sparta . He awarded the apple to Aphrodite, thus indirectly causing the Trojan War .[ 102] [ 103]
+
The apple was thus considered, in ancient Greece, sacred to Aphrodite. To throw an apple at someone was to symbolically declare one's love; similarly, to catch it was to symbolically show one's acceptance of that love. An epigram attributed to Plato states:[ 104]
+
+
I throw the apple at you, and if you are willing to love me, take it and share your girlhood with me; but if your thoughts are what I pray they are not, even then take it, and consider how short-lived is beauty.
+
Atalanta , also of Greek mythology, raced all her suitors in an attempt to avoid marriage. She outran all but Hippomenes (also known as Melanion , a name possibly derived from melon , the Greek word for both "apple" and fruit in general),[ 100] who defeated her by cunning, not speed. Hippomenes knew that he could not win in a fair race, so he used three golden apples (gifts of Aphrodite, the goddess of love) to distract Atalanta. It took all three apples and all of his speed, but Hippomenes was finally successful, winning the race and Atalanta's hand.[ 105] [ 106]
+
+
Celtic mythology
+
In Celtic mythology , the otherworld has many names, including Emain Ablach , "Emain of the Apple-trees". A version of this is Avalon in Arthurian legend , or in Welsh Ynys Afallon , "Island of Apples".[ 107]
+
+
China
+
Píngānguǒ ("Peace apples") on sale in Beijing for Christmas Eve (2017)
+
In China, apples symbolise peace , since the sounds of the first element ("píng") in the words "apple" (苹果, Píngguǒ ) and "peace" (平安, Píng'ān ) are homophonous in Mandarin and Cantonese.[ 3] [ 108] When these two words are combined, the word Píngānguǒ (平安果, "Peace apples") is formed. This association developed further as the name for Christmas Eve in Mandarin is Píngānyè (平安夜, "Peaceful/Quiet Evening"), which made the gifting of apples at this season to friends and associates popular, as a way to wish them peace and safety.[ 108]
+
+
Christian art
+
Adam and Eve by Albrecht Dürer (1507), showcasing the apple as a symbol of sin
+
Though the forbidden fruit of Eden in the Book of Genesis is not identified, popular Christian tradition has held that it was an apple that Eve coaxed Adam to share with her.[ 109] The origin of the popular identification with a fruit unknown in the Middle East in biblical times is found in wordplay with the Latin words mālum (an apple) and mălum (an evil), each of which is normally written malum .[ 110] The tree of the forbidden fruit is called "the tree of the knowledge of good and evil" in Genesis 2:17,[ 111] and the Latin for "good and evil" is bonum et malum .[ 112]
+
Renaissance painters may also have been influenced by the story of the golden apples in the Garden of Hesperides . As a result, in the story of Adam and Eve, the apple became a symbol for knowledge, immortality, temptation, the fall of man into sin, and sin itself. The larynx in the human throat has been called the "Adam's apple " because of a notion that it was caused by the forbidden fruit remaining in the throat of Adam. The apple as symbol of sexual seduction has been used to imply human sexuality, possibly in an ironic vein.[ 109]
+
+
Proverb
+
The proverb , "An apple a day keeps the doctor away ", addressing the supposed health benefits of the fruit, has been traced to 19th-century Wales , where the original phrase was "Eat an apple on going to bed, and you'll keep the doctor from earning his bread".[ 113] In the 19th century and early 20th, the phrase evolved to "an apple a day, no doctor to pay" and "an apple a day sends the doctor away"; the phrasing now commonly used was first recorded in 1922.[ 114]
+
+
See also
+
+
References
+
+
+^ Jump up to: a b c d e f g h i j k Dickson, Elizabeth E. (28 May 2021). "Malus domestica " . Flora of North America . Archived from the original on 28 July 2024. Retrieved 27 July 2024 .
+
+^ Jump up to: a b c "Malus domestica (Suckow) Borkh" . Plants of the World Online . Royal Botanic Gardens, Kew . Retrieved 31 July 2024 .
+
+^ Jump up to: a b Lim, Lisa (6 July 2021). "Where the word 'apple' came from and why the forbidden fruit was unlucky to be linked with the fall of man" . Language Matters. South China Morning Post . Hong Kong, China: Alibaba Group . Archived from the original on 28 June 2023. Retrieved 28 June 2023 .
+
+^ "Origin and meaning of "apple" by Online Etymology Dictionary" . Online Etymology Dictionary . Archived from the original on 21 December 2019. Retrieved 22 November 2019 .
+
+^ Jump up to: a b c d e f g Rieger, Mark. "Apple - Malus domestica " . HORT 3020: Intro Fruit Crops . University of Georgia . Archived from the original on 21 January 2008. Retrieved 22 January 2008 .
+
+^ Jump up to: a b c "Apples - Malus domestica " . North Carolina Extension Gardener Plant Toolbox . North Carolina State University . Archived from the original on 31 May 2024. Retrieved 31 July 2024 .
+
+^ Jump up to: a b c d Heil, Kenneth D.; O'Kane, Jr., Steve L.; Reeves, Linda Mary; Clifford, Arnold (2013). Flora of the Four Corners Region: Vascular Plants of the San Juan River Drainage, Arizona, Colorado, New Mexico, and Utah (First ed.). St. Louis, Missouri: Missouri Botanical Garden . p. 909. ISBN 978-1-930723-84-9 . ISSN 0161-1542 . LCCN 2012949654 . OCLC 859541992 . Retrieved 27 July 2024 .
+
+^ Jump up to: a b c d e Lim, Tong Kwee (2012). "Malus x domestica ". Edible Medicinal and Non-Medicinal Plants . Vol. 4, Fruit (First ed.). Dordrecht, the Netherlands: Springer . pp. 414–415. doi :10.1007/978-94-007-4053-2_49 . ISBN 978-94-007-4053-2 . OCLC 795503871 .
+
+^ Juniper, Barrie E. ; Mabberley, David J. (2006). The Story of the Apple (First ed.). Portland, Oregon: Timber Press . p. 27. ISBN 978-0-88192-784-9 . LCCN 2006011869 . OCLC 67383484 . Retrieved 1 August 2024 .
+
+^ "Fruit glossary" . Royal Horticultural Society. Archived from the original on 7 August 2024. Retrieved 7 August 2024 .
+
+^ Burford, Tom (2013). Apples of North America : 192 Exceptional Varieties for Gardeners, Growers and Cooks (First ed.). Portland, Oregon: Timber Press . pp. 22, 50, 55, 122, 123, 137, 141, 147, 159, 245, 246. ISBN 978-1-60469-249-5 . LCCN 2012045130 . OCLC 819860825 .
+
+^ "Shape" . Western Agricultural Research Center . Montana State University . Archived from the original on 23 April 2024. Retrieved 30 July 2024 .
+
+^ Jump up to: a b Janick, Jules; Cummins, James N.; Brown, Susan K.; Hemmat, Minou (1996). "Chapter 1: Apples" (PDF) . Fruit Breeding . Vol. I: Tree and Tropical Fruits. New York: John Wiley & Sons . pp. 9, 48. ISBN 978-0-471-31014-3 . LCCN 95016407 . OCLC 1302621533 . Archived (PDF) from the original on 19 July 2013. Retrieved 30 August 2024 .
+
+^ "Natural Waxes on Fruits" . Postharvest.tfrec.wsu.edu. 29 October 2010. Archived from the original on 24 May 2013. Retrieved 14 June 2013 .
+
+^ Flath, R. A.; Black, D. R.; Forrey, R. R.; McDonald, G. M.; Mon, T. R.; Teranishi, R. (1 August 1969). "Volatiles in Gravenstein Apple Essence Identified by GC-Mass Spectrometry". Journal of Chromatographic Science . 7 (8): 508. doi :10.1093/CHROMSCI/7.8.508 .
+
+^ Flath, Robert A.; Black, Dale Robert.; Guadagni, Dante G.; McFadden, William H.; Schultz, Thomas H. (January 1967). "Identification and organoleptic evaluation of compounds in Delicious apple essence". Journal of Agricultural and Food Chemistry . 15 (1): 29. doi :10.1021/jf60149a032 .
+
+^ Jump up to: a b Qian, Guan-Ze; Liu, Lian-Fen; Tang, Geng-Guo (April 2010). "(1933) Proposal to conserve the name Malus domestica against M. pumila , M. communis , M. frutescens , and Pyrus dioica ( Rosaceae )". Taxon . 59 (2): 650–652. doi :10.1002/tax.592038 .
+
+^ Applequist, Wendy L. (2017). "Report of the Nomenclature Committee for Vascular Plants: 69" (PDF) . Taxon . 66 (2): 500–513. doi :10.12705/662.17 . Archived (PDF) from the original on 7 May 2024.
+
+^ Wilson, Karen L. (June 2017). "Report of the General Committee: 18" . Taxon . 66 (3): 742. doi :10.12705/663.15 .
+
+^ Jump up to: a b Velasco, Riccardo; Zharkikh, Andrey; Affourtit, Jason; Dhingra, Amit; Cestaro, Alessandro; et al. (2010). "The genome of the domesticated apple (Malus × domestica Borkh.)" . Nature Genetics . 42 (10): 833–839. doi :10.1038/ng.654 . PMID 20802477 . S2CID 14854514 .
+
+^ Di Pierro, Erica A.; Gianfranceschi, Luca; Di Guardo, Mario; Koehorst-Van Putten, Herma J.J.; Kruisselbrink, Johannes W.; et al. (2016). "A high-density, multi-parental SNP genetic map on apple validates a new mapping approach for outcrossing species" . Horticulture Research . 3 (1): 16057. Bibcode :2016HorR....316057D . doi :10.1038/hortres.2016.57 . PMC 5120355 . PMID 27917289 .
+
+^ Jump up to: a b Daccord, Nicolas; Celton, Jean-Marc; Linsmith, Gareth; et al. (2017). "High-quality de novo assembly of the apple genome and methylome dynamics of early fruit development" . Nature Genetics . 49 (7). Nature Communications: 1099–1106. doi :10.1038/ng.3886 . hdl :10449/42064 . PMID 28581499 . S2CID 24690391 .
+
+^ Jump up to: a b Zhang, Liyi; Hu, Jiang; Han, Xiaolei; Li, Jingjing; Gao, Yuan; et al. (2019). "A high-quality apple genome assembly reveals the association of a retrotransposon and red fruit colour" . Nature Communications . 10 (1). Nature Genetics: 1494. Bibcode :2019NatCo..10.1494Z . doi :10.1038/s41467-019-09518-x . PMC 6445120 . PMID 30940818 .
+
+^ Jump up to: a b c d e Duan, Naibin; Bai, Yang; Sun, Honghe; Wang, Nan; Ma, Yumin; et al. (2017). "Genome re-sequencing reveals the history of apple and supports a two-stage model for fruit enlargement" . Nature Communications . 8 (1): 249. Bibcode :2017NatCo...8..249D . doi :10.1038/s41467-017-00336-7 . PMC 5557836 . PMID 28811498 .
+
+^ Richards, Christopher M.; Volk, Gayle M.; Reilley, Ann A.; Henk, Adam D.; Lockwood, Dale R.; et al. (2009). "Genetic diversity and population structure in Malus sieversii , a wild progenitor species of domesticated apple". Tree Genetics & Genomes . 5 (2): 339–347. doi :10.1007/s11295-008-0190-9 . S2CID 19847067 .
+
+^ Lauri, Pierre-éric; Maguylo, Karen; Trottier, Catherine (March 2006). "Architecture and size relations: an essay on the apple (Malus × domestica, Rosaceae) tree" . American Journal of Botany . 93 (3): 357–368. doi :10.3732/ajb.93.3.357 . PMID 21646196 . Archived from the original on 20 April 2019. Retrieved 27 July 2024 .
+
+^ Cornille, Amandine; Gladieux, Pierre; Smulders, Marinus J. M.; Roldán-Ruiz, Isabel; Laurens, François; et al. (2012). Mauricio, Rodney (ed.). "New Insight into the History of Domesticated Apple: Secondary Contribution of the European Wild Apple to the Genome of Cultivated Varieties" . PLOS Genetics . 8 (5): e1002703. doi :10.1371/journal.pgen.1002703 . PMC 3349737 . PMID 22589740 .
+
+^ Kean, Sam (17 May 2012). "ScienceShot: The Secret History of the Domesticated Apple" . Archived from the original on 11 June 2016.
+
+^ Coart, E.; Van Glabeke, S.; De Loose, M.; Larsen, A.S.; Roldán-Ruiz, I. (2006). "Chloroplast diversity in the genus Malus : new insights into the relationship between the European wild apple (Malus sylvestris (L.) Mill.) and the domesticated apple (Malus domestica Borkh.)". Mol. Ecol . 15 (8): 2171–2182. Bibcode :2006MolEc..15.2171C . doi :10.1111/j.1365-294x.2006.02924.x . PMID 16780433 . S2CID 31481730 .
+
+^ Rottoli, Mauro; Pessina, Andrea (2007). "Chapter 9: Neolithic agriculture in Italy: an update of archaeobotanical data with particular emphasis on northern settlements". In Colledge, Sue; Conolly, James (eds.). The Origins and Spread of Domestic Plants in Southwest Asia and Europe (First ed.). Walnut Creek, California: Left Coast Press ; University College London Institute of Archaeology Publications. pp. 142–143. ISBN 978-1-59874-988-5 . OCLC 84838157 .
+
+^ Jump up to: a b c d Schlumbaum, Angela; van Glabeke, Sabine; Roldan-Ruiz, Isabel (January 2012). "Towards the onset of fruit tree growing north of the Alps: Ancient DNA from waterlogged apple (Malus sp.) seed fragments". Annals of Anatomy - Anatomischer Anzeiger . 194 (1): 157–162. doi :10.1016/j.aanat.2011.03.004 . PMID 21501956 .
+
+^ Sauer, Jonathan D. (1993). Historical Geography of Crop Plants: A Select Roster (First ed.). Boca Raton, Florida: CRC Press . pp. 109–113. ISBN 978-0-8493-8901-6 . LCCN 92045590 . OCLC 27224696 .
+
+^ Plinius, Gaius Secundus (1855). The Natural History of Pliny . Vol. III. Translated by Bostock, John ; Riley, Henry T. London: Henry G. Bohn . p. 303. Retrieved 3 August 2024 .
+
+^ Martin, Alice A. (1976). All About Apples (First ed.). Boston, Massachusetts: Houghton Mifflin Company . pp. 64–65. ISBN 978-0-395-20724-6 . OCLC 1733691 . Retrieved 3 August 2024 .
+
+^ Adamson, Melitta Weiss (2004). Food in Medieval Times (First ed.). Westport, Connecticut: Greenwood Press . pp. 19–20. ISBN 978-0-313-32147-4 . LCCN 2004014054 . OCLC 55738647 .
+
+^ Torrejón, Fernando; Cisternas, Marco; Araneda, Alberto (2004). "Efectos ambientales de la colonización española desde el río Maullín al archipiélago de Chiloé, sur de Chile" [Environmental effects of the spanish colonization from de Maullín river to the Chiloé archipelago, southern Chile]. Revista Chilena de Historia Natural (in Spanish). 77 (4): 661–677. doi :10.4067/s0716-078x2004000400009 .
+
+^ Smith, Archibald William (1963). A Gardener's Book of Plant Names : A Handbook of the Meaning and Origins of Plant Names (First ed.). New York: Harper & Row . p. 40. LCCN 62009906 . OCLC 710612 . Retrieved 10 August 2024 .
+
+^ Jump up to: a b c Poole, Mike (1980). "Heirloom Apples" . In Lawrence, James (ed.). The Harrowsmith Reader Volume II . Camden East, Ontario: Camden House Publishing . p. 122. ISBN 978-0-920656-11-2 . OCLC 1336124440 . Retrieved 10 August 2024 .
+
+^ Van Valen, James M. (1900). History of Bergen County, New Jersey . New York: New Jersey Publishing and Engraving Company. pp. 33–34. OCLC 25697876 . Retrieved 9 August 2024 .
+
+^ Brox, Jane (1999). Five Thousand Days Like This One (First ed.). Boston, Massachusetts: Beacon Press. pp. 150–151. ISBN 978-0-8070-2106-4 . LCCN 98035051 . OCLC 39605684 . Retrieved 9 August 2024 .
+
+^ Cohen, Rachel D. (26 November 2018). "Thanks To Science, You Can Eat An Apple Every Day" . The Salt . NPR . Archived from the original on 18 June 2024. Retrieved 1 August 2024 .
+
+^ "The Heirloom Apple Orchard" . The Jentsch Lab . Cornell University . Archived from the original on 30 July 2024. Retrieved 9 August 2024 .
+
+^ Ranney, Thomas G. "Polyploidy: From Evolution to Landscape Plant Improvement" . Proceedings of the 11th Metropolitan Tree Improvement Alliance (METRIA) Conference . 11th Metropolitan Tree Improvement Alliance Conference held in Gresham, Oregon, August 23–24, 2000 . METRIA (NCSU.edu) . METRIA. Archived from the original on 23 July 2010. Retrieved 7 November 2010 .
+
+^ Lord, William G.; Ouellette, Amy (February 2010). "Dwarf Rootstocks for Apple Trees in the Home Garden" (PDF) . University of New Hampshire . Archived from the original (PDF) on 30 September 2013. Retrieved 1 September 2013 .
+
+^ Fallahi, Esmaeil; Colt, W. Michael; Fallahi, Bahar; Chun, Ik-Jo (January 2002). "The Importance of Apple Rootstocks on Tree Growth, Yield, Fruit Quality, Leaf Nutrition, and Photosynthesis with an Emphasis on 'Fuji' " . HortTechnology . 12 (1): 38–44. doi :10.21273/HORTTECH.12.1.38 . Archived (PDF) from the original on 11 February 2014. Retrieved 9 August 2024 .
+
+^ Parker, M.L. (September 1993). "Apple Rootstocks and Tree Spacing" . North Carolina Cooperative Extension Service . Archived from the original on 11 September 2013. Retrieved 1 September 2013 .
+
+^ Ferree, David Curtis; Warrington, Ian J. (2003). Apples: Botany, Production, and Uses . New York: Centre for Agriculture and Bioscience International. pp. 33–35. ISBN 978-0851995922 . OCLC 133167834 .
+
+^ Jump up to: a b c d Polomski, Bob; Reighard, Greg. "Apple HGIC 1350" . Home & Garden Information Center . Clemson University . Archived from the original on 28 February 2008. Retrieved 22 January 2008 .
+
+^ Barahona, M. (1992). "Adaptation of Apple Varieties in Ecuador". Acta Horticulturae (310): 135–142. doi :10.17660/ActaHortic.1992.310.17 .
+
+^ Adamson, Nancy Lee (2011). An Assessment of Non-Apis Bees as Fruit and Vegetable Crop Pollinators in Southwest Virginia (PDF) (Doctor of Philosophy in Entomology thesis). Virginia Polytechnic Institute and State University . Archived (PDF) from the original on 20 November 2015. Retrieved 15 October 2015 .
+
+^ Powell, L.E. (1986). "The Chilling Requirement in Apple and Its Role in Regulating Time of Flowering in Spring in Cold-Winter Climate". Acta Horticulturae (179). Wageningen, Netherlands: International Society for Horticultural Science : 129–140. doi :10.17660/ActaHortic.1986.179.10 . ISBN 978-90-6605-182-9 .
+
+^ Romano, Andrea (10 September 2023). "20 Best Places to Go Apple Picking in the United States" . Travel + Leisure . Archived from the original on 21 April 2024. Retrieved 2 August 2024 .
+
+^ Graziano, Jack; Farcuh, Macarena (10 September 2021). "Controlled Atmosphere Storage of Apples" . University of Maryland Extension . Archived from the original on 24 March 2023. Retrieved 2 August 2024 .
+
+^ "FoodKeeper App" . FoodSafety.gov . United States Department of Health and Human Services . 26 April 2019. Retrieved 17 September 2024 .
+
+^ "4 Steps to Food Safety" . FoodSafety.gov . United States Department of Health and Human Services . 12 April 2019. Retrieved 17 September 2024 .
+
+^ "Refrigerated storage of perishable foods" . CSIRO . 26 February 2015. Archived from the original on 15 March 2015. Retrieved 25 May 2007 .
+
+^ Karp, David (25 October 2006). "Puff the Magic Preservative: Lasting Crunch, but Less Scent" . The New York Times . Archived from the original on 3 August 2011. Retrieved 26 July 2017 .
+
+^ Jackson, H.S. (1914). "Powdery Mildew" . In Lowther, Granville; Worthington, William (eds.). The Encyclopedia of Practical Horticulture: A Reference System of Commercial Horticulture, Covering the Practical and Scientific Phases of Horticulture, with Special Reference to Fruits and Vegetables . Vol. I. North Yakima, Washington: The Encyclopedia of Horticulture Corporation. pp. 475–476. Retrieved 1 August 2024 .
+
+^ Lowther, Granville; Worthington, William, eds. (1914). The Encyclopedia of Practical Horticulture: A Reference System of Commercial Horticulture, Covering the Practical and Scientific Phases of Horticulture, with Special Reference to Fruits and Vegetables . Vol. I. North Yakima, Washington: The Encyclopedia of Horticulture Corporation. pp. 45–51. Retrieved 1 August 2024 .
+
+^ Coli, William M.; Los, Lorraine M., eds. (2003). "Insect Pests" . 2003-2004 New England Apple Pest Management Guide . University of Massachusetts Amherst . pp. 28–29. Archived from the original on 12 February 2008. Retrieved 3 March 2008 .
+
+^ Jump up to: a b Atthowe, Helen; Gilkeson, Linda A.; Kite, L. Patricia; Michalak, Patricia S.; Pleasant, Barbara; Reich, Lee; Scheider, Alfred F. (2009). Bradley, Fern Marshall; Ellis, Bardara W.; Martin, Deborah L. (eds.). The Organic Gardener's Handbook of Natural Pest and Disease Control . New York: Rodale, Inc. pp. 32–34. ISBN 978-1-60529-677-7 . LCCN 2009039996 . OCLC 419860680 .
+
+^ Coli, William M.; Berkett, Lorraine P.; Spitko, Robin, eds. (2003). "Other Apple Diseases" . 2003-2004 New England Apple Pest Management Guide . University of Massachusetts Amherst . pp. 19–27. Archived from the original on 12 February 2008. Retrieved 3 March 2008 .
+
+^ Martin, Phillip L.; Krawczyk, Teresa; Khodadadi, Fatemeh; Aćimović, Srđan G.; Peter, Kari A. (2021). "Bitter Rot of Apple in the Mid-Atlantic United States: Causal Species and Evaluation of the Impacts of Regional Weather Patterns and Cultivar Susceptibility" . Phytopathology . 111 (6): 966–981. doi :10.1094/PHYTO-09-20-0432-R . ISSN 0031-949X . PMID 33487025 . S2CID 231701083 .
+
+^ Erler, Fedai (1 January 2010). "Efficacy of tree trunk coating materials in the control of the apple clearwing, Synanthedon myopaeformis" . Journal of Insect Science . 10 (1): 63. doi :10.1673/031.010.6301 . PMC 3014806 . PMID 20672979 .
+
+^ Elzebroek, A. T. G.; Wind, Koop (2008). Guide to Cultivated Plants . Wallingford, United Kingdom: CABI . p. 27. ISBN 978-1-84593-356-2 . LCCN 2007028459 . OCLC 156975183 . Archived from the original on 20 October 2020. Retrieved 6 October 2020 .
+
+^ Jump up to: a b "Apple – Malus domestica " . Natural England . Archived from the original on 12 May 2008. Retrieved 22 January 2008 .
+
+^ "Home" . National Fruit Collection . Archived from the original on 15 June 2012. Retrieved 2 December 2012 .
+
+^ "ECPGR Malus/Pyrus Working Group Members" . Ecpgr.cgiar.org . 22 July 2002. Archived from the original on 26 August 2014. Retrieved 25 August 2014 .
+
+^ Jump up to: a b Tarjan, Sue (Fall 2006). "Autumn Apple Musings" (PDF) . News & Notes of the UCSC Farm & Garden, Center for Agroecology & Sustainable Food Systems. pp. 1–2. Archived from the original (PDF) on 11 August 2007. Retrieved 24 January 2008 .
+
+^ Beck, Kellen (17 October 2020). "How breeders bring out the best in new apples" . Mashable . Archived from the original on 31 July 2024. Retrieved 31 July 2024 .
+
+^ Migicovsky, Zoë (22 August 2021). "How a few good apples spawned today's top varieties — and why breeders must branch out" . The Conversation . Archived from the original on 31 July 2024. Retrieved 31 July 2024 .
+
+^ Peil, A.; Dunemann, F.; Richter, K.; Hoefer, M.; Király, I.; Flachowsky, H.; Hanke, M.-V. (2008). "Resistance Breeding in Apple at Dresden-Pillnitz" . Ecofruit - 13th International Conference on Cultivation Technique and Phytopathological Problems in Organic Fruit-Growing: Proceedings to the Conference from 18thFebruary to 20th February 2008 at Weinsberg/Germany (in German): 220–225. Archived from the original on 28 January 2021. Retrieved 31 July 2024 .
+
+^ Jump up to: a b "World apple situation" . Archived from the original on 11 February 2008. Retrieved 24 January 2008 .
+
+^ Weaver, Sue (June–July 2003). "Crops & Gardening – Apples of Antiquity" . Hobby Farms Magazine . Archived from the original on 19 February 2017.
+
+^ Jump up to: a b c "Apple production in 2022; from pick lists: Crops/World Regions/Production Quantity" . FAOSTAT, UN Food and Agriculture Organization , Statistics Division. 2024. Archived from the original on 12 November 2016. Retrieved 18 June 2024 .
+
+^ Nelson, Lewis S.; Shih, Richard D.; Balick, Michael J. (2007). Handbook of Poisonous and Injurious Plants (Second ed.). New York: New York Botanical Garden : Springer . pp. 27, 211–212. ISBN 978-0387-31268-2 . LCCN 2005938815 . OCLC 77537459 . Retrieved 11 September 2024 .
+
+^ "Amygdalin" . Toxnet, US Library of Medicine. Archived from the original on 21 April 2017. Retrieved 20 April 2017 .
+
+^ Jump up to: a b c d e f "General Information – Apple" . Informall. Archived from the original on 23 July 2012. Retrieved 17 October 2011 .
+
+^ Landau, Elizabeth, Oral allergy syndrome may explain mysterious reactions , 8 April 2009, CNN Health , accessed 17 October 2011
+
+^ United States Food and Drug Administration (2024). "Daily Value on the Nutrition and Supplement Facts Labels" . FDA . Archived from the original on 27 March 2024. Retrieved 28 March 2024 .
+
+^ National Academies of Sciences, Engineering, and Medicine; Health and Medicine Division; Food and Nutrition Board; Committee to Review the Dietary Reference Intakes for Sodium and Potassium (2019). Oria, Maria; Harrison, Meghan; Stallings, Virginia A. (eds.). Dietary Reference Intakes for Sodium and Potassium . The National Academies Collection: Reports funded by National Institutes of Health. Washington, DC: National Academies Press (US). ISBN 978-0-309-48834-1 . PMID 30844154 . Archived from the original on 9 May 2024. Retrieved 21 June 2024 .
+
+Further reading
+
+Browning, Frank (1998). Apples (First ed.). New York: North Point Press. ISBN 978-0-86547-537-3. LCCN 98027252. OCLC 39235786.
+Hanson, Beth; Marinelli, Janet; Saphire, Sigrun Wolff; Tebbitt, Mark, eds. (2003). The Best Apples to Buy and Grow (First ed.). Brooklyn, New York: Brooklyn Botanic Garden. ISBN 978-1-889538-66-2. OCLC 60384060.
+Juniper, Barrie E.; Mabberley, David J. (2006). The Story of the Apple (First ed.). Portland, Oregon: Timber Press. ISBN 978-0-88192-784-9. LCCN 2006011869. OCLC 67383484.
+Phillips, Michael (1998). The Apple Grower: A Guide for the Organic Orchardist (First ed.). White River Junction, Vermont: Chelsea Green Publishing. ISBN 978-1-890132-04-0. LCCN 98003631. OCLC 38731995.
+Sanders, Rosie (2010). The Apple Book (Second ed.). London: Frances Lincoln Limited. ISBN 9780711231412. OCLC 646397065.
+
+External links
+
+Media related to Apples at Wikimedia Commons
\ No newline at end of file
diff --git a/tests/async/test_0.4.2_browser_manager.py b/tests/async/test_0.4.2_browser_manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bb195822dabbcadcf546fdffeaaee58ddd3bd7e
--- /dev/null
+++ b/tests/async/test_0.4.2_browser_manager.py
@@ -0,0 +1,153 @@
+import os, sys
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+
+
+import asyncio
+from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai.content_filter_strategy import PruningContentFilter
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+# Assuming that the changes made allow different configurations
+# for managed browser, persistent context, and so forth.
+
+async def test_default_headless():
+ async with AsyncWebCrawler(
+ headless=True,
+ verbose=True,
+ user_agent_mode="random",
+ user_agent_generator_config={"device_type": "mobile", "os_type": "android"},
+ use_managed_browser=False,
+ use_persistent_context=False,
+ ignore_https_errors=True,
+ # Testing normal ephemeral context
+ ) as crawler:
+ result = await crawler.arun(
+ url='https://www.kidocode.com/degrees/technology',
+ cache_mode=CacheMode.BYPASS,
+ markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
+ )
+ print("[test_default_headless] success:", result.success)
+ print("HTML length:", len(result.html if result.html else ""))
+
+async def test_managed_browser_persistent():
+ # Treating use_persistent_context=True as managed_browser scenario.
+ async with AsyncWebCrawler(
+ headless=False,
+ verbose=True,
+ user_agent_mode="random",
+ user_agent_generator_config={"device_type": "desktop", "os_type": "mac"},
+ use_managed_browser=True,
+ use_persistent_context=True, # now should behave same as managed browser
+ user_data_dir="./outpu/test_profile",
+ # This should store and reuse profile data across runs
+ ) as crawler:
+ result = await crawler.arun(
+ url='https://www.google.com',
+ cache_mode=CacheMode.BYPASS,
+ markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True})
+ )
+ print("[test_managed_browser_persistent] success:", result.success)
+ print("HTML length:", len(result.html if result.html else ""))
+
+async def test_session_reuse():
+ # Test creating a session, using it for multiple calls
+ session_id = "my_session"
+ async with AsyncWebCrawler(
+ headless=False,
+ verbose=True,
+ user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
+ # Fixed user-agent for consistency
+ use_managed_browser=False,
+ use_persistent_context=False,
+ ) as crawler:
+
+ # First call: create session
+ result1 = await crawler.arun(
+ url='https://www.example.com',
+ cache_mode=CacheMode.BYPASS,
+ session_id=session_id,
+ markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True})
+ )
+ print("[test_session_reuse first call] success:", result1.success)
+
+ # Second call: same session, so cookies may be retained
+ result2 = await crawler.arun(
+ url='https://www.example.com/about',
+ cache_mode=CacheMode.BYPASS,
+ session_id=session_id,
+ markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True})
+ )
+ print("[test_session_reuse second call] success:", result2.success)
+
+async def test_magic_mode():
+ # Test magic mode with override_navigator and simulate_user
+ async with AsyncWebCrawler(
+ headless=False,
+ verbose=True,
+ user_agent_mode="random",
+ user_agent_generator_config={"device_type": "desktop", "os_type": "windows"},
+ use_managed_browser=False,
+ use_persistent_context=False,
+ magic=True,
+ override_navigator=True,
+ simulate_user=True,
+ ) as crawler:
+ result = await crawler.arun(
+ url='https://www.kidocode.com/degrees/business',
+ cache_mode=CacheMode.BYPASS,
+ markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True})
+ )
+ print("[test_magic_mode] success:", result.success)
+ print("HTML length:", len(result.html if result.html else ""))
+
+async def test_proxy_settings():
+ # Test with a proxy (if available) to ensure code runs with proxy
+ async with AsyncWebCrawler(
+ headless=True,
+ verbose=False,
+ user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
+ proxy="http://127.0.0.1:8080", # Assuming local proxy server for test
+ use_managed_browser=False,
+ use_persistent_context=False,
+ ) as crawler:
+ result = await crawler.arun(
+ url='https://httpbin.org/ip',
+ cache_mode=CacheMode.BYPASS,
+ markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True})
+ )
+ print("[test_proxy_settings] success:", result.success)
+ if result.success:
+ print("HTML preview:", result.html[:200] if result.html else "")
+
+async def test_ignore_https_errors():
+ # Test ignoring HTTPS errors with a self-signed or otherwise invalid certificate.
+ # This is conceptual; the domain should be one that triggers an SSL error.
+ # Using a URL that fails SSL validation:
+ async with AsyncWebCrawler(
+ headless=True,
+ verbose=True,
+ user_agent="Mozilla/5.0",
+ ignore_https_errors=True,
+ use_managed_browser=False,
+ use_persistent_context=False,
+ ) as crawler:
+ result = await crawler.arun(
+ url='https://self-signed.badssl.com/',
+ cache_mode=CacheMode.BYPASS,
+ markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True})
+ )
+ print("[test_ignore_https_errors] success:", result.success)
+
+async def main():
+ print("Running tests...")
+ # await test_default_headless()
+ # await test_managed_browser_persistent()
+ # await test_session_reuse()
+ # await test_magic_mode()
+ # await test_proxy_settings()
+ await test_ignore_https_errors()
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/tests/async/test_0.4.2_config_params.py b/tests/async/test_0.4.2_config_params.py
new file mode 100644
index 0000000000000000000000000000000000000000..623ac3ab7823f0b924ab5fbad9dfdc40bee65696
--- /dev/null
+++ b/tests/async/test_0.4.2_config_params.py
@@ -0,0 +1,231 @@
+import os, sys
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+
+import asyncio
+from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+from crawl4ai.content_filter_strategy import PruningContentFilter
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+from crawl4ai.chunking_strategy import RegexChunking
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+# Category 1: Browser Configuration Tests
+async def test_browser_config_object():
+ """Test the new BrowserConfig object with various browser settings"""
+ browser_config = BrowserConfig(
+ browser_type="chromium",
+ headless=False,
+ viewport_width=1920,
+ viewport_height=1080,
+ use_managed_browser=True,
+ user_agent_mode="random",
+ user_agent_generator_config={"device_type": "desktop", "os_type": "windows"}
+ )
+
+ async with AsyncWebCrawler(config=browser_config, verbose=True) as crawler:
+ result = await crawler.arun('https://example.com', cache_mode=CacheMode.BYPASS)
+ assert result.success, "Browser config crawl failed"
+ assert len(result.html) > 0, "No HTML content retrieved"
+
+async def test_browser_performance_config():
+ """Test browser configurations focused on performance"""
+ browser_config = BrowserConfig(
+ text_mode=True,
+ light_mode=True,
+ extra_args=['--disable-gpu', '--disable-software-rasterizer'],
+ ignore_https_errors=True,
+ java_script_enabled=False
+ )
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun('https://example.com')
+ assert result.success, "Performance optimized crawl failed"
+ assert result.status_code == 200, "Unexpected status code"
+
+# Category 2: Content Processing Tests
+async def test_content_extraction_config():
+ """Test content extraction with various strategies"""
+ crawler_config = CrawlerRunConfig(
+ word_count_threshold=300,
+ extraction_strategy=JsonCssExtractionStrategy(
+ schema={
+ "name": "article",
+ "baseSelector": "div",
+ "fields": [{
+ "name": "title",
+ "selector": "h1",
+ "type": "text"
+ }]
+ }
+ ),
+ chunking_strategy=RegexChunking(),
+ content_filter=PruningContentFilter()
+ )
+
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ 'https://example.com/article',
+ config=crawler_config
+ )
+ assert result.extracted_content is not None, "Content extraction failed"
+ assert 'title' in result.extracted_content, "Missing expected content field"
+
+# Category 3: Cache and Session Management Tests
+async def test_cache_and_session_management():
+ """Test different cache modes and session handling"""
+ browser_config = BrowserConfig(use_persistent_context=True)
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.WRITE_ONLY,
+ process_iframes=True,
+ remove_overlay_elements=True
+ )
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ # First request - should write to cache
+ result1 = await crawler.arun(
+ 'https://example.com',
+ config=crawler_config
+ )
+
+ # Second request - should use fresh fetch due to WRITE_ONLY mode
+ result2 = await crawler.arun(
+ 'https://example.com',
+ config=crawler_config
+ )
+
+ assert result1.success and result2.success, "Cache mode crawl failed"
+ assert result1.html == result2.html, "Inconsistent results between requests"
+
+# Category 4: Media Handling Tests
+async def test_media_handling_config():
+ """Test configurations related to media handling"""
+ # Ensure the downloads directory ~/.crawl4ai/downloads exists under the home directory
+ os.makedirs(os.path.expanduser("~/.crawl4ai/downloads"), exist_ok=True)
+ browser_config = BrowserConfig(
+ viewport_width=1920,
+ viewport_height=1080,
+ accept_downloads=True,
+ downloads_path=os.path.expanduser("~/.crawl4ai/downloads")
+ )
+ crawler_config = CrawlerRunConfig(
+ screenshot=True,
+ pdf=True,
+ adjust_viewport_to_content=True,
+ wait_for_images=True,
+ screenshot_height_threshold=20000
+ )
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(
+ 'https://example.com',
+ config=crawler_config
+ )
+ assert result.screenshot is not None, "Screenshot capture failed"
+ assert result.pdf is not None, "PDF generation failed"
+
+# Category 5: Anti-Bot and Site Interaction Tests
+async def test_antibot_config():
+ """Test configurations for handling anti-bot measures"""
+ crawler_config = CrawlerRunConfig(
+ simulate_user=True,
+ override_navigator=True,
+ magic=True,
+ wait_for="js:()=>document.querySelector('body')",
+ delay_before_return_html=1.0,
+ log_console=True,
+ cache_mode=CacheMode.BYPASS
+ )
+
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ 'https://example.com',
+ config=crawler_config
+ )
+ assert result.success, "Anti-bot measure handling failed"
+
+# Category 6: Parallel Processing Tests
+async def test_parallel_processing():
+ """Test parallel processing capabilities"""
+ crawler_config = CrawlerRunConfig(
+ mean_delay=0.5,
+ max_range=1.0,
+ semaphore_count=5
+ )
+
+ urls = [
+ 'https://example.com/1',
+ 'https://example.com/2',
+ 'https://example.com/3'
+ ]
+
+ async with AsyncWebCrawler() as crawler:
+ results = await crawler.arun_many(
+ urls,
+ config=crawler_config
+ )
+ assert len(results) == len(urls), "Not all URLs were processed"
+ assert all(r.success for r in results), "Some parallel requests failed"
+
+# Category 7: Backwards Compatibility Tests
+async def test_legacy_parameter_support():
+ """Test that legacy parameters still work"""
+ async with AsyncWebCrawler(
+ headless=True,
+ browser_type="chromium",
+ viewport_width=1024,
+ viewport_height=768
+ ) as crawler:
+ result = await crawler.arun(
+ 'https://example.com',
+ screenshot=True,
+ word_count_threshold=200,
+ bypass_cache=True,
+ css_selector=".main-content"
+ )
+ assert result.success, "Legacy parameter support failed"
+
+# Category 8: Mixed Configuration Tests
+async def test_mixed_config_usage():
+ """Test mixing new config objects with legacy parameters"""
+ browser_config = BrowserConfig(headless=True)
+ crawler_config = CrawlerRunConfig(screenshot=True)
+
+ async with AsyncWebCrawler(
+ config=browser_config,
+ verbose=True # legacy parameter
+ ) as crawler:
+ result = await crawler.arun(
+ 'https://example.com',
+ config=crawler_config,
+ cache_mode=CacheMode.BYPASS, # legacy parameter
+ css_selector="body" # legacy parameter
+ )
+ assert result.success, "Mixed configuration usage failed"
+
+if __name__ == "__main__":
+ async def run_tests():
+ test_functions = [
+ test_browser_config_object,
+ # test_browser_performance_config,
+ # test_content_extraction_config,
+ # test_cache_and_session_management,
+ # test_media_handling_config,
+ # test_antibot_config,
+ # test_parallel_processing,
+ # test_legacy_parameter_support,
+ # test_mixed_config_usage
+ ]
+
+ for test in test_functions:
+ print(f"\nRunning {test.__name__}...")
+ try:
+ await test()
+ print(f"✓ {test.__name__} passed")
+ except AssertionError as e:
+ print(f"✗ {test.__name__} failed: {str(e)}")
+ except Exception as e:
+ print(f"✗ {test.__name__} error: {str(e)}")
+
+ asyncio.run(run_tests())
\ No newline at end of file
diff --git a/tests/async/test_async_doanloader.py b/tests/async/test_async_doanloader.py
new file mode 100644
index 0000000000000000000000000000000000000000..4798b4ca8e9879486f2e200c279234bcf3a286bc
--- /dev/null
+++ b/tests/async/test_async_doanloader.py
@@ -0,0 +1,229 @@
+import os
+import sys
+import asyncio
+import shutil
+from typing import List
+import tempfile
+import time
+
+# Add the parent directory to the Python path
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+
+from crawl4ai.async_webcrawler import AsyncWebCrawler
+
+class TestDownloads:
+ def __init__(self):
+ self.temp_dir = tempfile.mkdtemp(prefix="crawl4ai_test_")
+ self.download_dir = os.path.join(self.temp_dir, "downloads")
+ os.makedirs(self.download_dir, exist_ok=True)
+ self.results: List[str] = []
+
+ def cleanup(self):
+ shutil.rmtree(self.temp_dir)
+
+ def log_result(self, test_name: str, success: bool, message: str = ""):
+ result = f"{'✅' if success else '❌'} {test_name}: {message}"
+ self.results.append(result)
+ print(result)
+
+ async def test_basic_download(self):
+ """Test basic file download functionality"""
+ try:
+ async with AsyncWebCrawler(
+ accept_downloads=True,
+ downloads_path=self.download_dir,
+ verbose=True
+ ) as crawler:
+ # Python.org downloads page typically has stable download links
+ result = await crawler.arun(
+ url="https://www.python.org/downloads/",
+ js_code="""
+ // Click first download link
+ const downloadLink = document.querySelector('a[href$=".exe"]');
+ if (downloadLink) downloadLink.click();
+ """
+ )
+
+ success = result.downloaded_files is not None and len(result.downloaded_files) > 0
+ self.log_result(
+ "Basic Download",
+ success,
+ f"Downloaded {len(result.downloaded_files or [])} files" if success else "No files downloaded"
+ )
+ except Exception as e:
+ self.log_result("Basic Download", False, str(e))
+
+ async def test_persistent_context_download(self):
+ """Test downloads with persistent context"""
+ try:
+ user_data_dir = os.path.join(self.temp_dir, "user_data")
+ os.makedirs(user_data_dir, exist_ok=True)
+
+ async with AsyncWebCrawler(
+ accept_downloads=True,
+ downloads_path=self.download_dir,
+ use_persistent_context=True,
+ user_data_dir=user_data_dir,
+ verbose=True
+ ) as crawler:
+ result = await crawler.arun(
+ url="https://www.python.org/downloads/",
+ js_code="""
+ const downloadLink = document.querySelector('a[href$=".exe"]');
+ if (downloadLink) downloadLink.click();
+ """
+ )
+
+ success = result.downloaded_files is not None and len(result.downloaded_files) > 0
+ self.log_result(
+ "Persistent Context Download",
+ success,
+ f"Downloaded {len(result.downloaded_files or [])} files" if success else "No files downloaded"
+ )
+ except Exception as e:
+ self.log_result("Persistent Context Download", False, str(e))
+
+ async def test_multiple_downloads(self):
+ """Test multiple simultaneous downloads"""
+ try:
+ async with AsyncWebCrawler(
+ accept_downloads=True,
+ downloads_path=self.download_dir,
+ verbose=True
+ ) as crawler:
+ result = await crawler.arun(
+ url="https://www.python.org/downloads/",
+ js_code="""
+ // Click multiple download links
+ const downloadLinks = document.querySelectorAll('a[href$=".exe"]');
+ downloadLinks.forEach(link => link.click());
+ """
+ )
+
+ success = result.downloaded_files is not None and len(result.downloaded_files) > 1
+ self.log_result(
+ "Multiple Downloads",
+ success,
+ f"Downloaded {len(result.downloaded_files or [])} files" if success else "Not enough files downloaded"
+ )
+ except Exception as e:
+ self.log_result("Multiple Downloads", False, str(e))
+
+ async def test_different_browsers(self):
+ """Test downloads across different browser types"""
+ browsers = ["chromium", "firefox", "webkit"]
+
+ for browser_type in browsers:
+ try:
+ async with AsyncWebCrawler(
+ accept_downloads=True,
+ downloads_path=self.download_dir,
+ browser_type=browser_type,
+ verbose=True
+ ) as crawler:
+ result = await crawler.arun(
+ url="https://www.python.org/downloads/",
+ js_code="""
+ const downloadLink = document.querySelector('a[href$=".exe"]');
+ if (downloadLink) downloadLink.click();
+ """
+ )
+
+ success = result.downloaded_files is not None and len(result.downloaded_files) > 0
+ self.log_result(
+ f"{browser_type.title()} Download",
+ success,
+ f"Downloaded {len(result.downloaded_files or [])} files" if success else "No files downloaded"
+ )
+ except Exception as e:
+ self.log_result(f"{browser_type.title()} Download", False, str(e))
+
+ async def test_edge_cases(self):
+ """Test various edge cases"""
+
+ # Test 1: Downloads without specifying download path
+ try:
+ async with AsyncWebCrawler(
+ accept_downloads=True,
+ verbose=True
+ ) as crawler:
+ result = await crawler.arun(
+ url="https://www.python.org/downloads/",
+ js_code="document.querySelector('a[href$=\".exe\"]').click()"
+ )
+ self.log_result(
+ "Default Download Path",
+ True,
+ f"Downloaded to default path: {result.downloaded_files[0] if result.downloaded_files else 'None'}"
+ )
+ except Exception as e:
+ self.log_result("Default Download Path", False, str(e))
+
+ # Test 2: Downloads with invalid path
+ try:
+ async with AsyncWebCrawler(
+ accept_downloads=True,
+ downloads_path="/invalid/path/that/doesnt/exist",
+ verbose=True
+ ) as crawler:
+ result = await crawler.arun(
+ url="https://www.python.org/downloads/",
+ js_code="document.querySelector('a[href$=\".exe\"]').click()"
+ )
+ self.log_result("Invalid Download Path", False, "Should have raised an error")
+ except Exception as e:
+ self.log_result("Invalid Download Path", True, "Correctly handled invalid path")
+
+ # Test 3: Download with accept_downloads=False
+ try:
+ async with AsyncWebCrawler(
+ accept_downloads=False,
+ verbose=True
+ ) as crawler:
+ result = await crawler.arun(
+ url="https://www.python.org/downloads/",
+ js_code="document.querySelector('a[href$=\".exe\"]').click()"
+ )
+ success = result.downloaded_files is None
+ self.log_result(
+ "Disabled Downloads",
+ success,
+ "Correctly ignored downloads" if success else "Unexpectedly downloaded files"
+ )
+ except Exception as e:
+ self.log_result("Disabled Downloads", False, str(e))
+
+ async def run_all_tests(self):
+ """Run all test cases"""
+ print("\n🧪 Running Download Tests...\n")
+
+ test_methods = [
+ self.test_basic_download,
+ self.test_persistent_context_download,
+ self.test_multiple_downloads,
+ self.test_different_browsers,
+ self.test_edge_cases
+ ]
+
+ for test in test_methods:
+ print(f"\n📝 Running {test.__doc__}...")
+ await test()
+ await asyncio.sleep(2) # Brief pause between tests
+
+ print("\n📊 Test Results Summary:")
+ for result in self.results:
+ print(result)
+
+ successes = len([r for r in self.results if '✅' in r])
+ total = len(self.results)
+ print(f"\nTotal: {successes}/{total} tests passed")
+
+ self.cleanup()
+
+async def main():
+ tester = TestDownloads()
+ await tester.run_all_tests()
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/tests/async/test_basic_crawling.py b/tests/async/test_basic_crawling.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce38ac2ff19ad0b857506480f49ca3dd2cb7fe18
--- /dev/null
+++ b/tests/async/test_basic_crawling.py
@@ -0,0 +1,81 @@
+import os
+import sys
+import pytest
+import asyncio
+import time
+
+# Add the parent directory to the Python path
+parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.append(parent_dir)
+
+from crawl4ai.async_webcrawler import AsyncWebCrawler
+
+@pytest.mark.asyncio
+async def test_successful_crawl():
+ async with AsyncWebCrawler(verbose=True) as crawler:
+ url = "https://www.nbcnews.com/business"
+ result = await crawler.arun(url=url, bypass_cache=True)
+ assert result.success
+ assert result.url == url
+ assert result.html
+ assert result.markdown
+ assert result.cleaned_html
+
+@pytest.mark.asyncio
+async def test_invalid_url():
+ async with AsyncWebCrawler(verbose=True) as crawler:
+ url = "https://www.invalidurl12345.com"
+ result = await crawler.arun(url=url, bypass_cache=True)
+ assert not result.success
+ assert result.error_message
+
+@pytest.mark.asyncio
+async def test_multiple_urls():
+ async with AsyncWebCrawler(verbose=True) as crawler:
+ urls = [
+ "https://www.nbcnews.com/business",
+ "https://www.example.com",
+ "https://www.python.org"
+ ]
+ results = await crawler.arun_many(urls=urls, bypass_cache=True)
+ assert len(results) == len(urls)
+ assert all(result.success for result in results)
+ assert all(result.html for result in results)
+
+@pytest.mark.asyncio
+async def test_javascript_execution():
+ async with AsyncWebCrawler(verbose=True) as crawler:
+ js_code = "document.body.innerHTML = 'Modified by JS ';"
+ url = "https://www.example.com"
+ result = await crawler.arun(url=url, bypass_cache=True, js_code=js_code)
+ assert result.success
+ assert "Modified by JS " in result.html
+
+@pytest.mark.asyncio
+async def test_concurrent_crawling_performance():
+ async with AsyncWebCrawler(verbose=True) as crawler:
+ urls = [
+ "https://www.nbcnews.com/business",
+ "https://www.example.com",
+ "https://www.python.org",
+ "https://www.github.com",
+ "https://www.stackoverflow.com"
+ ]
+
+ start_time = time.time()
+ results = await crawler.arun_many(urls=urls, bypass_cache=True)
+ end_time = time.time()
+
+ total_time = end_time - start_time
+ print(f"Total time for concurrent crawling: {total_time:.2f} seconds")
+
+ assert all(result.success for result in results)
+ assert len(results) == len(urls)
+
+ # Assert that concurrent crawling is faster than sequential
+ # This multiplier may need adjustment based on the number of URLs and their complexity
+ assert total_time < len(urls) * 5, f"Concurrent crawling not significantly faster: {total_time:.2f} seconds"
+
+# Entry point for debugging
+if __name__ == "__main__":
+ pytest.main([__file__, "-v"])
\ No newline at end of file
diff --git a/tests/async/test_caching.py b/tests/async/test_caching.py
new file mode 100644
index 0000000000000000000000000000000000000000..589beca98a5306e1be2cd5ce354049ba4010f76d
--- /dev/null
+++ b/tests/async/test_caching.py
@@ -0,0 +1,82 @@
+import os
+import sys
+import pytest
+import asyncio
+
+# Add the parent directory to the Python path
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+
+from crawl4ai.async_webcrawler import AsyncWebCrawler
+
+@pytest.mark.asyncio
+async def test_caching():
+ async with AsyncWebCrawler(verbose=True) as crawler:
+ url = "https://www.nbcnews.com/business"
+
+ # First crawl (should not use cache)
+ start_time = asyncio.get_event_loop().time()
+ result1 = await crawler.arun(url=url, bypass_cache=True)
+ end_time = asyncio.get_event_loop().time()
+ time_taken1 = end_time - start_time
+
+ assert result1.success
+
+ # Second crawl (should use cache)
+ start_time = asyncio.get_event_loop().time()
+ result2 = await crawler.arun(url=url, bypass_cache=False)
+ end_time = asyncio.get_event_loop().time()
+ time_taken2 = end_time - start_time
+
+ assert result2.success
+ assert time_taken2 < time_taken1 # Cached result should be faster
+
+@pytest.mark.asyncio
+async def test_bypass_cache():
+ async with AsyncWebCrawler(verbose=True) as crawler:
+ url = "https://www.nbcnews.com/business"
+
+ # First crawl
+ result1 = await crawler.arun(url=url, bypass_cache=False)
+ assert result1.success
+
+ # Second crawl with bypass_cache=True
+ result2 = await crawler.arun(url=url, bypass_cache=True)
+ assert result2.success
+
+ # Content should be different (or at least, not guaranteed to be the same)
+ assert result1.html != result2.html or result1.markdown != result2.markdown
+
+@pytest.mark.asyncio
+async def test_clear_cache():
+ async with AsyncWebCrawler(verbose=True) as crawler:
+ url = "https://www.nbcnews.com/business"
+
+ # Crawl and cache
+ await crawler.arun(url=url, bypass_cache=False)
+
+ # Clear cache
+ await crawler.aclear_cache()
+
+ # Check cache size
+ cache_size = await crawler.aget_cache_size()
+ assert cache_size == 0
+
+@pytest.mark.asyncio
+async def test_flush_cache():
+ async with AsyncWebCrawler(verbose=True) as crawler:
+ url = "https://www.nbcnews.com/business"
+
+ # Crawl and cache
+ await crawler.arun(url=url, bypass_cache=False)
+
+ # Flush cache
+ await crawler.aflush_cache()
+
+ # Check cache size
+ cache_size = await crawler.aget_cache_size()
+ assert cache_size == 0
+
+# Entry point for debugging
+if __name__ == "__main__":
+ pytest.main([__file__, "-v"])
\ No newline at end of file
diff --git a/tests/async/test_chunking_and_extraction_strategies.py b/tests/async/test_chunking_and_extraction_strategies.py
new file mode 100644
index 0000000000000000000000000000000000000000..af1c9fbdadbc5ac167a73fc1a3fce0b662664404
--- /dev/null
+++ b/tests/async/test_chunking_and_extraction_strategies.py
@@ -0,0 +1,87 @@
+import os
+import sys
+import pytest
+import asyncio
+import json
+
+# Add the parent directory to the Python path
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+
+from crawl4ai.async_webcrawler import AsyncWebCrawler
+from crawl4ai.chunking_strategy import RegexChunking, NlpSentenceChunking
+from crawl4ai.extraction_strategy import CosineStrategy, LLMExtractionStrategy
+
+@pytest.mark.asyncio
+async def test_regex_chunking():
+ async with AsyncWebCrawler(verbose=True) as crawler:
+ url = "https://www.nbcnews.com/business"
+ chunking_strategy = RegexChunking(patterns=["\n\n"])
+ result = await crawler.arun(
+ url=url,
+ chunking_strategy=chunking_strategy,
+ bypass_cache=True
+ )
+ assert result.success
+ assert result.extracted_content
+ chunks = json.loads(result.extracted_content)
+ assert len(chunks) > 1 # Ensure multiple chunks were created
+
+# @pytest.mark.asyncio
+# async def test_cosine_strategy():
+# async with AsyncWebCrawler(verbose=True) as crawler:
+# url = "https://www.nbcnews.com/business"
+# extraction_strategy = CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold=0.3)
+# result = await crawler.arun(
+# url=url,
+# extraction_strategy=extraction_strategy,
+# bypass_cache=True
+# )
+# assert result.success
+# assert result.extracted_content
+# extracted_data = json.loads(result.extracted_content)
+# assert len(extracted_data) > 0
+# assert all('tags' in item for item in extracted_data)
+
+@pytest.mark.asyncio
+async def test_llm_extraction_strategy():
+ async with AsyncWebCrawler(verbose=True) as crawler:
+ url = "https://www.nbcnews.com/business"
+ extraction_strategy = LLMExtractionStrategy(
+ provider="openai/gpt-4o-mini",
+ api_token=os.getenv('OPENAI_API_KEY'),
+ instruction="Extract only content related to technology"
+ )
+ result = await crawler.arun(
+ url=url,
+ extraction_strategy=extraction_strategy,
+ bypass_cache=True
+ )
+ assert result.success
+ assert result.extracted_content
+ extracted_data = json.loads(result.extracted_content)
+ assert len(extracted_data) > 0
+ assert all('content' in item for item in extracted_data)
+
+# @pytest.mark.asyncio
+# async def test_combined_chunking_and_extraction():
+# async with AsyncWebCrawler(verbose=True) as crawler:
+# url = "https://www.nbcnews.com/business"
+# chunking_strategy = RegexChunking(patterns=["\n\n"])
+# extraction_strategy = CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold=0.3)
+# result = await crawler.arun(
+# url=url,
+# chunking_strategy=chunking_strategy,
+# extraction_strategy=extraction_strategy,
+# bypass_cache=True
+# )
+# assert result.success
+# assert result.extracted_content
+# extracted_data = json.loads(result.extracted_content)
+# assert len(extracted_data) > 0
+# assert all('tags' in item for item in extracted_data)
+# assert all('content' in item for item in extracted_data)
+
+# Entry point for debugging
+if __name__ == "__main__":
+ pytest.main([__file__, "-v"])
\ No newline at end of file
diff --git a/tests/async/test_content_extraction.py b/tests/async/test_content_extraction.py
new file mode 100644
index 0000000000000000000000000000000000000000..7604db203ac9a6e46ce4be567962a5c72bfdeb49
--- /dev/null
+++ b/tests/async/test_content_extraction.py
@@ -0,0 +1,90 @@
+import os
+import sys
+import pytest
+import asyncio
+import json
+
+# Add the parent directory to the Python path
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+
+from crawl4ai.async_webcrawler import AsyncWebCrawler
+
+@pytest.mark.asyncio
+async def test_extract_markdown():
+ async with AsyncWebCrawler(verbose=True) as crawler:
+ url = "https://www.nbcnews.com/business"
+ result = await crawler.arun(url=url, bypass_cache=True)
+ assert result.success
+ assert result.markdown
+ assert isinstance(result.markdown, str)
+ assert len(result.markdown) > 0
+
+@pytest.mark.asyncio
+async def test_extract_cleaned_html():
+ async with AsyncWebCrawler(verbose=True) as crawler:
+ url = "https://www.nbcnews.com/business"
+ result = await crawler.arun(url=url, bypass_cache=True)
+ assert result.success
+ assert result.cleaned_html
+ assert isinstance(result.cleaned_html, str)
+ assert len(result.cleaned_html) > 0
+
+@pytest.mark.asyncio
+async def test_extract_media():
+ async with AsyncWebCrawler(verbose=True) as crawler:
+ url = "https://www.nbcnews.com/business"
+ result = await crawler.arun(url=url, bypass_cache=True)
+ assert result.success
+ assert result.media
+ media = result.media
+ assert isinstance(media, dict)
+ assert "images" in media
+ assert isinstance(media["images"], list)
+ for image in media["images"]:
+ assert "src" in image
+ assert "alt" in image
+ assert "type" in image
+
+@pytest.mark.asyncio
+async def test_extract_links():
+ async with AsyncWebCrawler(verbose=True) as crawler:
+ url = "https://www.nbcnews.com/business"
+ result = await crawler.arun(url=url, bypass_cache=True)
+ assert result.success
+ assert result.links
+ links = result.links
+ assert isinstance(links, dict)
+ assert "internal" in links
+ assert "external" in links
+ assert isinstance(links["internal"], list)
+ assert isinstance(links["external"], list)
+ for link in links["internal"] + links["external"]:
+ assert "href" in link
+ assert "text" in link
+
+@pytest.mark.asyncio
+async def test_extract_metadata():
+ async with AsyncWebCrawler(verbose=True) as crawler:
+ url = "https://www.nbcnews.com/business"
+ result = await crawler.arun(url=url, bypass_cache=True)
+ assert result.success
+ assert result.metadata
+ metadata = result.metadata
+ assert isinstance(metadata, dict)
+ assert "title" in metadata
+ assert isinstance(metadata["title"], str)
+
+@pytest.mark.asyncio
+async def test_css_selector_extraction():
+ async with AsyncWebCrawler(verbose=True) as crawler:
+ url = "https://www.nbcnews.com/business"
+ css_selector = "h1, h2, h3"
+ result = await crawler.arun(url=url, bypass_cache=True, css_selector=css_selector)
+ assert result.success
+ assert result.markdown
+ assert all(heading in result.markdown for heading in ["#", "##", "###"])
+
+# Entry point for debugging
+if __name__ == "__main__":
+ pytest.main([__file__, "-v"])
\ No newline at end of file
diff --git a/tests/async/test_content_filter_bm25.py b/tests/async/test_content_filter_bm25.py
new file mode 100644
index 0000000000000000000000000000000000000000..a873c414a0436875a90b2670ee81a9025bf54b08
--- /dev/null
+++ b/tests/async/test_content_filter_bm25.py
@@ -0,0 +1,175 @@
+import os, sys
+import pytest
+from bs4 import BeautifulSoup
+from typing import List
+
+# Add the parent directory to the Python path
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+
+from crawl4ai.content_filter_strategy import BM25ContentFilter
+
+@pytest.fixture
+def basic_html():
+ return """
+
+
+ Test Article
+
+
+
+
+ Main Heading
+
+ This is a long paragraph with more than fifty words. It continues with more text to ensure we meet the minimum word count threshold. We need to make sure this paragraph is substantial enough to be considered for extraction according to our filtering rules. This should be enough words now.
+ Skip this nav content
+
+
+
+ """
+
+@pytest.fixture
+def wiki_html():
+ return """
+
+
+ Wikipedia Article
+
+
+ Article Title
+ Section 1
+ Short but important section header description.
+
+
Long paragraph with sufficient words to meet the minimum threshold. This paragraph continues with more text to ensure we have enough content for proper testing. We need to make sure this has enough words to pass our filters and be considered valid content for extraction purposes.
+
+
+
+ """
+
+@pytest.fixture
+def no_meta_html():
+ return """
+
+
+ Simple Page
+ First paragraph that should be used as fallback for query when no meta tags exist. This text needs to be long enough to serve as a meaningful fallback for our content extraction process.
+
+
+ """
+
+class TestBM25ContentFilter:
+ def test_basic_extraction(self, basic_html):
+ """Test basic content extraction functionality"""
+ filter = BM25ContentFilter()
+ contents = filter.filter_content(basic_html)
+
+ assert contents, "Should extract content"
+ assert len(contents) >= 1, "Should extract at least one content block"
+ assert "long paragraph" in ' '.join(contents).lower()
+ assert "navigation" not in ' '.join(contents).lower()
+
+ def test_user_query_override(self, basic_html):
+ """Test that user query overrides metadata extraction"""
+ user_query = "specific test query"
+ filter = BM25ContentFilter(user_query=user_query)
+
+ # Access internal state to verify query usage
+ soup = BeautifulSoup(basic_html, 'lxml')
+ extracted_query = filter.extract_page_query(soup.find('head'))
+
+ assert extracted_query == user_query
+ assert "Test description" not in extracted_query
+
+ def test_header_extraction(self, wiki_html):
+ """Test that headers are properly extracted despite length"""
+ filter = BM25ContentFilter()
+ contents = filter.filter_content(wiki_html)
+
+ combined_content = ' '.join(contents).lower()
+ assert "section 1" in combined_content, "Should include section header"
+ assert "article title" in combined_content, "Should include main title"
+
+ def test_no_metadata_fallback(self, no_meta_html):
+ """Test fallback behavior when no metadata is present"""
+ filter = BM25ContentFilter()
+ contents = filter.filter_content(no_meta_html)
+
+ assert contents, "Should extract content even without metadata"
+ assert "First paragraph" in ' '.join(contents), "Should use first paragraph content"
+
+ def test_empty_input(self):
+ """Test handling of empty input"""
+ filter = BM25ContentFilter()
+ assert filter.filter_content("") == []
+ assert filter.filter_content(None) == []
+
+ def test_malformed_html(self):
+ """Test handling of malformed HTML"""
+ malformed_html = "Unclosed paragraph
Nested content
"
+ filter = BM25ContentFilter()
+ contents = filter.filter_content(malformed_html)
+
+ assert isinstance(contents, list), "Should return list even with malformed HTML"
+
+ def test_threshold_behavior(self, basic_html):
+ """Test different BM25 threshold values"""
+ strict_filter = BM25ContentFilter(bm25_threshold=2.0)
+ lenient_filter = BM25ContentFilter(bm25_threshold=0.5)
+
+ strict_contents = strict_filter.filter_content(basic_html)
+ lenient_contents = lenient_filter.filter_content(basic_html)
+
+ assert len(strict_contents) <= len(lenient_contents), \
+ "Strict threshold should extract fewer elements"
+
+ def test_html_cleaning(self, basic_html):
+ """Test HTML cleaning functionality"""
+ filter = BM25ContentFilter()
+ contents = filter.filter_content(basic_html)
+
+ cleaned_content = ' '.join(contents)
+ assert 'class=' not in cleaned_content, "Should remove class attributes"
+ assert 'style=' not in cleaned_content, "Should remove style attributes"
+ assert '