Source code for intelliscraper.common.models

"""Pydantic data models for IntelliScraper.

Defines the core data structures used throughout the library:

- ``RequestEvent`` / ``SessionStats`` — time-series request tracking.
- ``Session`` — browser session with cookies, storage, and fingerprint.
- ``Proxy`` — proxy server configuration.
- ``ScrapeRequest`` — input parameters for a scrape operation.
- ``ScrapeResponse`` — output of a scrape operation with enriched
  metadata.
"""

from __future__ import annotations

from collections import Counter
from datetime import timedelta
from threading import Lock

from pydantic import BaseModel, Field, PrivateAttr

from intelliscraper.enums import BrowsingMode, ScrapStatus


class RequestEvent(BaseModel):
    """One entry in the time-series log of scraping requests.

    Pairs the moment a request was sent with the status it ended in, so
    that a sequence of events can serve as an audit trail or as input to
    performance analysis.

    Attributes:
        sent_at: Unix timestamp when this request was sent.
        request_status: Outcome status of the scraping request.
    """

    sent_at: float = Field(
        description="Unix timestamp when this request was sent",
    )
    request_status: ScrapStatus = Field(
        description="Outcome status of the scraping request",
    )
class SessionStats(BaseModel):
    """Request-event log with per-status counts for a scraping session.

    Events are appended through ``add_request_event`` and summarised by
    the ``stats`` property. Both take an internal ``Lock`` while touching
    ``request_events``, so they can be called from multiple threads.

    Attributes:
        request_events: Chronological list of all request events.
    """

    # Kept as in the original configuration of this model.
    model_config = {"arbitrary_types_allowed": True}

    request_events: list[RequestEvent] = Field(
        description="Chronological list of all request events",
        default_factory=list,
    )

    # Private attribute: one lock per instance, created on construction.
    _lock: Lock = PrivateAttr(default_factory=Lock)

    def add_request_event(self, request_event: RequestEvent) -> None:
        """Append an event to the log while holding the internal lock.

        Args:
            request_event: The ``RequestEvent`` to record.
        """
        guard = self._lock
        with guard:
            self.request_events.append(request_event)

    @property
    def stats(self) -> dict[str, int]:
        """Count logged events per request status.

        Returns:
            Dictionary with one key per ``ScrapStatus`` member value and
            the number of logged events carrying that status (``0`` when
            none), e.g.::

                {"success": 42, "partial_success": 3, "failed": 1, ...}
        """
        # Only the read of the shared list needs the lock; the tally is a
        # local object and can be turned into the result afterwards.
        with self._lock:
            tally = Counter(
                logged.request_status.value for logged in self.request_events
            )

        # Counter yields 0 for statuses that never occurred.
        return {member.value: tally[member.value] for member in ScrapStatus}
class Session(BaseModel):
    """Stored browser session used for authenticated scraping.

    Holds the state needed to resume an authenticated browser session:
    cookies, localStorage, sessionStorage, and a browser fingerprint for
    anti-detection, together with the request statistics for the session.

    Attributes:
        site: Identifier for the target site (e.g. ``"linkedin"``).
        base_url: The base URL used for scraping.
        cookies: List of cookie dicts captured from the session.
        localStorage: Key-value pairs from the browser's localStorage.
        sessionStorage: Key-value pairs from the browser's sessionStorage.
        fingerprint: Browser fingerprint data for anti-detection.
        stats: Time-series event log and computed statistics.
    """

    # --- identity -------------------------------------------------------
    site: str = Field(
        description="Identifier of the target site (e.g. 'linkedin')",
    )
    base_url: str = Field(
        description="The base URL used for scraping",
    )

    # --- captured browser state ----------------------------------------
    cookies: list[dict] = Field(
        description="List of cookies captured from the session",
        default_factory=list,
    )
    localStorage: dict | None = Field(
        description="Key-value pairs from browser's localStorage",
        default=None,
    )
    sessionStorage: dict | None = Field(
        description="Key-value pairs from browser's sessionStorage",
        default=None,
    )
    fingerprint: dict | None = Field(
        description="Browser fingerprint data for anti-detection",
        default=None,
    )

    # --- bookkeeping ----------------------------------------------------
    stats: SessionStats = Field(
        description="Time-series event log and computed statistics",
        default_factory=SessionStats,
    )
class Proxy(BaseModel):
    """Proxy server settings for network requests.

    Applied at the browser-context level in **managed browser mode
    only**; every page within a scraper instance shares the same proxy.
    Not used in local browser mode, where the user's Chrome instance
    manages its own network configuration.

    Attributes:
        server: Proxy server URL (e.g. ``http://myproxy.com:3128``).
        bypass: Comma-separated domains to bypass the proxy.
        username: Proxy authentication username.
        password: Proxy authentication password.
    """

    server: str = Field(
        description=(
            "Proxy server URL or host:port. "
            "Supports HTTP and SOCKS schemes "
            "(e.g. 'http://myproxy.com:3128', 'socks5://myproxy.com:1080')."
        ),
    )
    bypass: str | None = Field(
        description=(
            "Comma-separated domains to bypass the proxy"
            " (e.g. '.example.com,localhost')."
        ),
        default=None,
    )

    # Optional credentials; both default to None.
    username: str | None = Field(
        description="Username for proxy authentication",
        default=None,
    )
    password: str | None = Field(
        description="Password for proxy authentication",
        default=None,
    )
class ScrapeRequest(BaseModel):
    """Parameters describing one scraping request.

    Records everything used to start a scrape so that a response can be
    traced back to the exact request that produced it.

    Attributes:
        url: The target URL to scrape.
        timeout: Maximum time allowed for page load.
        browser_launch_options: Options used to launch the browser.
        proxy: Proxy configuration used, if any.
        session_data: Session information used, if any.
        browsing_mode: Browser behaviour mode (FAST or HUMAN_LIKE).
    """

    # Required inputs.
    url: str = Field(
        description="The target URL to scrape",
    )
    timeout: timedelta = Field(
        description="Maximum time allowed for page load",
    )

    # Optional inputs; each defaults to None when not supplied.
    browser_launch_options: dict | None = Field(
        description="Options used to launch the browser",
        default=None,
    )
    proxy: Proxy | None = Field(
        description="Proxy configuration used during the scrape",
        default=None,
    )
    session_data: Session | None = Field(
        description="Session information (cookies, storage, auth data)",
        default=None,
    )
    browsing_mode: BrowsingMode | None = Field(
        description="Browser behaviour mode (FAST or HUMAN_LIKE)",
        default=None,
    )
class ScrapeResponse(BaseModel):
    """Result of a web scraping operation plus its metadata.

    Carries the scraped content, timing information, the HTTP status, and
    details about which session and browser mode were used.

    Attributes:
        scrape_request: The original request parameters.
        status: Final outcome status of the scrape.
        http_status_code: Actual HTTP status code returned by the server
            (e.g. 200, 403, 429). ``None`` if the request failed before
            receiving a response.
        elapsed_time: Total scrape duration in seconds.
        scrap_html_content: Raw HTML content from the page.
        error_msg: Error message if the scrape failed.
        session_id: Identifier of the session used (the ``site`` field
            from ``Session``), or ``None`` if no session.
        browser_mode: Which browser backend was used:
            ``"local_browser"`` or ``"managed_browser"``.
    """

    # Required: what was asked for and how it ended.
    scrape_request: ScrapeRequest = Field(
        description="The original request parameters",
    )
    status: ScrapStatus = Field(
        description="Final outcome status of the scrape",
    )

    # Optional result details; each defaults to None.
    http_status_code: int | None = Field(
        description="HTTP status code from the server (e.g. 200, 403, 429)",
        default=None,
    )
    elapsed_time: float | None = Field(
        description="Total scrape duration in seconds",
        default=None,
    )
    scrap_html_content: str | None = Field(
        description="Raw HTML content from the target page",
        default=None,
    )
    error_msg: str | None = Field(
        description="Error message if scraping failed; None on success",
        default=None,
    )

    # Optional provenance metadata; each defaults to None.
    session_id: str | None = Field(
        description="Session site identifier used for this scrape",
        default=None,
    )
    browser_mode: str | None = Field(
        description="Browser backend: 'local_browser' or 'managed_browser'",
        default=None,
    )