# Source code for intelliscraper.common.models

"""Pydantic data models for IntelliScraper.

Defines the core data structures used throughout the library:

- ``RequestEvent`` / ``SessionStats`` — time-series request tracking.
- ``Session`` — browser session with cookies, storage, and fingerprint.
- ``Proxy`` — proxy server configuration.
- ``ScrapeRequest`` — input parameters for a scrape operation.
- ``ScrapeResponse`` — output of a scrape operation with enriched
  metadata.
"""

from __future__ import annotations

from collections import Counter
from datetime import timedelta
from threading import Lock

from pydantic import BaseModel, Field, PrivateAttr

from intelliscraper.enums import BrowsingMode, ScrapStatus



[docs]
class RequestEvent(BaseModel):
    """One entry in a session's time-series request log.

    Pairs the moment a scraping request was sent with the status that
    request finished with.

    Attributes:
        sent_at: Unix timestamp when this request was sent.
        request_status: Outcome status of the scraping request.
    """

    sent_at: float = Field(
        description="Unix timestamp when this request was sent",
    )
    request_status: ScrapStatus = Field(
        description="Outcome status of the scraping request",
    )




[docs]
class SessionStats(BaseModel):
    """Request-event log with lock-guarded recording and aggregation.

    Keeps every ``RequestEvent`` recorded for a session, in the order
    the events were added, and exposes a per-status count of them.
    Both ``add_request_event`` and ``stats`` run while holding a
    private ``Lock``.

    Attributes:
        request_events: Chronological list of all request events.
    """

    model_config = {"arbitrary_types_allowed": True}

    request_events: list[RequestEvent] = Field(
        description="Chronological list of all request events",
        default_factory=list,
    )

    # Private attribute rather than a model field; the factory gives
    # every instance its own lock.
    _lock: Lock = PrivateAttr(default_factory=Lock)

    def add_request_event(self, request_event: RequestEvent) -> None:
        """Append an event to the log while holding the internal lock.

        Args:
            request_event: The ``RequestEvent`` to record.
        """
        with self._lock:
            event_log = self.request_events
            event_log.append(request_event)

    @property
    def stats(self) -> dict[str, int]:
        """Count the recorded events per request status.

        Returns:
            Dictionary keyed by the ``value`` of every ``ScrapStatus``
            member, mapped to how many recorded events carry that
            status.  Statuses with no recorded events map to ``0``.
        """
        with self._lock:
            tally = Counter(
                evt.request_status.value for evt in self.request_events
            )
            breakdown: dict[str, int] = {}
            for member in ScrapStatus:
                # Indexing a Counter with an unseen key yields 0.
                breakdown[member.value] = tally[member.value]
            return breakdown




[docs]
class Session(BaseModel):
    """Captured browser session state used for authenticated scraping.

    Bundles the cookies, web-storage contents, and browser fingerprint
    taken from a session, together with the request statistics
    collected for it.

    Attributes:
        site: Identifier of the target site (e.g. ``"linkedin"``).
        base_url: The base URL used for scraping.
        cookies: Cookie dicts captured from the session; empty list by
            default.
        localStorage: Key-value pairs from the browser's localStorage,
            or ``None``.
        sessionStorage: Key-value pairs from the browser's
            sessionStorage, or ``None``.
        fingerprint: Browser fingerprint data for anti-detection, or
            ``None``.
        stats: ``SessionStats`` holding the event log; a fresh
            instance is created when none is supplied.
    """

    site: str = Field(
        description="Identifier of the target site (e.g. 'linkedin')",
    )
    base_url: str = Field(
        description="The base URL used for scraping",
    )
    cookies: list[dict] = Field(
        description="List of cookies captured from the session",
        default_factory=list,
    )
    localStorage: dict | None = Field(
        description="Key-value pairs from browser's localStorage",
        default=None,
    )
    sessionStorage: dict | None = Field(
        description="Key-value pairs from browser's sessionStorage",
        default=None,
    )
    fingerprint: dict | None = Field(
        description="Browser fingerprint data for anti-detection",
        default=None,
    )
    stats: SessionStats = Field(
        description="Time-series event log and computed statistics",
        default_factory=SessionStats,
    )




[docs]
class Proxy(BaseModel):
    """Proxy configuration for network requests.

    Applied at the browser-context level in **managed browser mode
    only**.  All pages within a scraper instance share the same proxy.

    Not used in local browser mode — the user's Chrome instance
    manages its own network configuration.

    Attributes:
        server: Proxy server URL or ``host:port``.  HTTP and SOCKS
            schemes are supported (e.g. ``http://myproxy.com:3128``,
            ``socks5://myproxy.com:1080``).
        bypass: Comma-separated domains to bypass the proxy.
        username: Proxy authentication username.
        password: Proxy authentication password.  Omitted from the
            model's ``repr()``/``str()`` output; still returned by
            attribute access and by serialization.
    """

    server: str = Field(
        description=(
            "Proxy server URL or host:port.  Supports HTTP and SOCKS "
            "schemes (e.g. 'http://myproxy.com:3128', "
            "'socks5://myproxy.com:1080')."
        ),
    )
    bypass: str | None = Field(
        default=None,
        description=(
            "Comma-separated domains to bypass the proxy "
            "(e.g. '.example.com,localhost')."
        ),
    )
    username: str | None = Field(
        default=None,
        description="Username for proxy authentication",
    )
    password: str | None = Field(
        default=None,
        description="Password for proxy authentication",
        # Keep the credential out of repr()/str() so that logging a
        # Proxy, or a request/response that embeds one, does not
        # print the password.
        repr=False,
    )




[docs]
class ScrapeRequest(BaseModel):
    """Parameters describing one scrape operation.

    Holds everything that was supplied to start a scrape so that a
    response can be traced back to the request that produced it.

    Attributes:
        url: The target URL to scrape.
        timeout: Maximum time allowed for page load.
        browser_launch_options: Options used to launch the browser,
            or ``None``.
        proxy: Proxy configuration used during the scrape, or ``None``.
        session_data: Session information (cookies, storage, auth
            data), or ``None``.
        browsing_mode: Browser behaviour mode (FAST or HUMAN_LIKE),
            or ``None``.
    """

    url: str = Field(
        description="The target URL to scrape",
    )
    timeout: timedelta = Field(
        description="Maximum time allowed for page load",
    )
    browser_launch_options: dict | None = Field(
        description="Options used to launch the browser",
        default=None,
    )
    proxy: Proxy | None = Field(
        description="Proxy configuration used during the scrape",
        default=None,
    )
    session_data: Session | None = Field(
        description="Session information (cookies, storage, auth data)",
        default=None,
    )
    browsing_mode: BrowsingMode | None = Field(
        description="Browser behaviour mode (FAST or HUMAN_LIKE)",
        default=None,
    )




[docs]
class ScrapeResponse(BaseModel):
    """Result of one scrape operation plus its metadata.

    Carries the originating request, the final status, and optional
    details: HTTP status code, duration, page HTML, error message,
    and which session and browser backend were used.  Every field
    other than ``scrape_request`` and ``status`` defaults to ``None``.

    Attributes:
        scrape_request: The original request parameters.
        status: Final outcome status of the scrape.
        http_status_code: HTTP status code from the server
            (e.g. 200, 403, 429).
        elapsed_time: Total scrape duration in seconds.
        scrap_html_content: Raw HTML content from the target page.
        error_msg: Error message if scraping failed; ``None`` on
            success.
        session_id: Session site identifier used for this scrape.
        browser_mode: Browser backend: ``"local_browser"`` or
            ``"managed_browser"``.
    """

    scrape_request: ScrapeRequest = Field(
        description="The original request parameters",
    )
    status: ScrapStatus = Field(
        description="Final outcome status of the scrape",
    )
    http_status_code: int | None = Field(
        description="HTTP status code from the server (e.g. 200, 403, 429)",
        default=None,
    )
    elapsed_time: float | None = Field(
        description="Total scrape duration in seconds",
        default=None,
    )
    scrap_html_content: str | None = Field(
        description="Raw HTML content from the target page",
        default=None,
    )
    error_msg: str | None = Field(
        description="Error message if scraping failed; None on success",
        default=None,
    )
    session_id: str | None = Field(
        description="Session site identifier used for this scrape",
        default=None,
    )
    browser_mode: str | None = Field(
        description="Browser backend: 'local_browser' or 'managed_browser'",
        default=None,
    )