# Source code for intelliscraper.parsers.html_parser

"""General-purpose HTML parser.

Parses raw HTML content and provides access to plain text, links,
Markdown, and LLM-optimised Markdown (with boilerplate stripped).

This is the default parser used by IntelliScraper and the recommended
base class for site-specific parsers.

Example::

    parser = HTMLParser(url="https://example.com", html=html_string)
    print(parser.text)               # plain text
    print(parser.links)              # list of absolute URLs
    print(parser.markdown)           # full Markdown
    print(parser.markdown_for_llm)   # cleaned Markdown for LLM input
    print(parser.navigable_links)    # classified internal/external links
"""

from __future__ import annotations

from functools import cached_property

from bs4 import BeautifulSoup
from html_to_markdown import LinkType
from html_to_markdown.api import convert as _convert
from html_to_markdown.options import PreprocessingOptions

from intelliscraper.enums import HTMLParserType
from intelliscraper.exception import HTMLParserInputError
from intelliscraper.parsers.base_parser import BaseParser
from intelliscraper.utils import normalize_links

# ---------------------------------------------------------------------------
# html-to-markdown v3.x exposes ConversionOptions as an immutable Rust type
# with a broken Python constructor.  The public api.py wrapper accepts any
# object with the right string attributes, so we use a plain class instead.
# ---------------------------------------------------------------------------


class _ConversionOpts:
    """Plain-Python stand-in for ``ConversionOptions``.

    Instances are handed to ``html_to_markdown.api.convert`` in place of the
    library's own options type.  Every option is exposed as an ordinary
    instance attribute; only the two constructor arguments vary between
    instances, everything else is a fixed default.

    Args:
        preprocessing: Stored unchanged on ``self.preprocessing``.
        extract_metadata: Stored unchanged on ``self.extract_metadata``.
    """

    def __init__(
        self,
        preprocessing: PreprocessingOptions | None = None,
        extract_metadata: bool = True,
    ) -> None:
        # The table is rebuilt on every call, so each instance owns its own
        # list objects for the list-valued options.
        settings = {
            "heading_style": "atx",
            "list_indent_type": "spaces",
            "list_indent_width": 2,
            "bullets": "-*+",
            "strong_em_symbol": "*",
            "escape_asterisks": False,
            "escape_underscores": False,
            "escape_misc": False,
            "escape_ascii": False,
            "code_language": "",
            "autolinks": True,
            "default_title": False,
            "br_in_tables": False,
            "highlight_style": "double_equal",
            "extract_metadata": extract_metadata,
            "whitespace_mode": "normalized",
            "strip_newlines": False,
            "wrap": False,
            "wrap_width": 80,
            "convert_as_inline": False,
            "sub_symbol": "",
            "sup_symbol": "",
            "newline_style": "spaces",
            "code_block_style": "backticks",
            "keep_inline_images_in": [],
            "preprocessing": preprocessing,
            "encoding": "utf-8",
            "debug": False,
            "strip_tags": [],
            "preserve_tags": [],
            "skip_images": False,
            "link_style": "inline",
            "output_format": "markdown",
            "include_document_structure": False,
            "extract_images": False,
            "max_image_size": 5_242_880,
            "capture_svg": False,
            "infer_dimensions": True,
            "max_depth": None,
            "exclude_selectors": [],
        }
        for option_name, option_value in settings.items():
            setattr(self, option_name, option_value)


# Options for the standard conversion used by ``HTMLParser._conversion_result``:
# "standard" preprocessing preset with navigation and forms kept, and
# metadata extraction switched on (``navigable_links`` reads
# ``metadata.links`` from that result).
_OPTS_STANDARD = _ConversionOpts(
    preprocessing=PreprocessingOptions(
        enabled=True,
        preset="standard",
        remove_navigation=False,
        remove_forms=False,
    ),
    extract_metadata=True,
)

# Options for ``HTMLParser.markdown_for_llm``: "aggressive" preprocessing
# preset with navigation and forms removed.  Metadata extraction is off
# because only the converted content is read from that result.
_OPTS_LLM = _ConversionOpts(
    preprocessing=PreprocessingOptions(
        enabled=True,
        preset="aggressive",
        remove_navigation=True,
        remove_forms=True,
    ),
    extract_metadata=False,
)



# [docs]
class HTMLParser(BaseParser):
    """General-purpose HTML parser with text, link, and Markdown extraction.

    Wraps BeautifulSoup for DOM querying and ``html-to-markdown`` for
    Markdown conversion.  Provides both standard and LLM-optimised
    Markdown outputs.

    The BeautifulSoup tree (``soup``) is built eagerly in ``__init__``;
    every derived property is lazily computed and cached on first access.

    Args:
        url: The source URL of the page (used for link normalisation).
        html: Raw HTML string.  Must be a non-empty string.
        html_parser_type: BeautifulSoup parser backend.  Defaults to
            ``HTMLParserType.HTML5LIB``.

    Raises:
        HTMLParserInputError: If ``html`` is empty or not a string.

    Example::

        parser = HTMLParser(
            url="https://example.com/page",
            html=response.scrap_html_content,
        )
        print(parser.text)
        print(parser.links)
        print(parser.markdown_for_llm)
    """

    def __init__(
        self,
        url: str,
        html: str,
        html_parser_type: HTMLParserType = HTMLParserType.HTML5LIB,
    ) -> None:
        # Check the type *before* truthiness: a non-string whose truth value
        # is ambiguous or raises (e.g. an array) must still produce the
        # documented HTMLParserInputError rather than a foreign exception.
        if not (isinstance(html, str) and html):
            raise HTMLParserInputError(
                "HTMLParser expects a non-empty string as HTML input."
            )
        self.url = url
        self.html = html
        self.soup = BeautifulSoup(html, html_parser_type.value)

    @cached_property
    def _conversion_result(self):
        """Single conversion shared by ``markdown`` and ``navigable_links``.

        Runs ``html-to-markdown`` once with the standard options and caches
        the result object so both properties reuse the same conversion.
        """
        return _convert(self.html, _OPTS_STANDARD)

    @cached_property
    def text(self) -> str:
        """Plain text extracted from the HTML.

        Uses BeautifulSoup's ``get_text()`` with newline separators and
        whitespace stripping.
        """
        return self.soup.get_text(separator="\n", strip=True)

    @cached_property
    def links(self) -> list[str]:
        """All normalised ``href`` values from ``<a>`` tags.

        Anchors without an ``href`` (or with an empty one) are ignored.
        The remaining values are passed to ``normalize_links`` together
        with the source URL, which resolves relative URLs and removes
        duplicates and fragment-only links.

        Returns:
            A deduplicated list of absolute HTTP/HTTPS URLs.
        """
        hrefs = [
            href
            for anchor in self.soup.find_all("a")
            if (href := anchor.get("href"))
        ]
        return normalize_links(base_url=self.url, links=hrefs)

    @cached_property
    def navigable_links(self) -> list[dict]:
        """Internal and external page links, classified and normalised.

        Only links the converter classifies as ``LinkType.Internal`` or
        ``LinkType.External`` are kept, which skips anchors
        (``#fragment``), ``mailto:``, ``tel:``, ``javascript:``, and
        resource links (CSS/JS).  Links with a blank ``href`` or that
        ``normalize_links`` rejects are dropped as well.

        Returns:
            A list of dicts, each with keys:

            - ``href`` — absolute URL
            - ``text`` — visible link label
            - ``title`` — title attribute or ``None``
            - ``link_type`` — ``"Internal"`` or ``"External"``
            - ``rel`` — list of rel values (e.g. ``["nofollow"]``)
        """
        # Tolerate a conversion result that carries no metadata at all, in
        # the same way a missing ``links`` collection is already tolerated.
        metadata = self._conversion_result.metadata
        raw_links = (metadata.links if metadata is not None else None) or []

        navigable_types = (LinkType.Internal, LinkType.External)
        result = []
        for link in raw_links:
            if link.link_type not in navigable_types:
                continue
            href = (link.href or "").strip()
            if not href:
                continue
            # Normalise one link at a time so the surviving URL stays paired
            # with this link's text/title/rel.
            normalised = normalize_links(base_url=self.url, links=[href])
            if not normalised:
                continue
            result.append(
                {
                    "href": normalised[0],
                    "text": (link.text or "").strip(),
                    "title": link.title,
                    # Keep only the member name from the enum's string form.
                    "link_type": str(link.link_type).split(".")[-1],
                    "rel": list(link.rel or []),
                }
            )
        return result

    @cached_property
    def markdown(self) -> str:
        """Full-page Markdown with standard preprocessing.

        Preserves navigation, forms, and page structure.  Returns an empty
        string when the conversion yields no content.
        """
        return self._conversion_result.content or ""

    @cached_property
    def markdown_for_llm(self) -> str:
        """Markdown with nav, ads, forms, and boilerplate stripped.

        Optimised for use as LLM input — removes elements that add
        noise without informational value.  This runs its own conversion
        with the aggressive options rather than reusing the standard one.
        Returns an empty string when the conversion yields no content.
        """
        return _convert(self.html, _OPTS_LLM).content or ""