Coverage for src/extratools_html/__init__.py: 41%
68 statements
coverage.py v7.8.0, created at 2025-05-12 00:00 -0700
from __future__ import annotations

import asyncio
from collections.abc import Iterable
from contextlib import suppress
from enum import StrEnum
from http import HTTPStatus
from typing import Any

import backoff
import httpx
import truststore
from html2text import HTML2Text

with suppress(ImportError):
    from playwright.async_api import Browser, async_playwright, expect
    from playwright.async_api import TimeoutError as PlaywrightTimeoutError

from .cleanup import cleanup_page

truststore.inject_into_ssl()

MAX_TRIES: int = 3
# In seconds
MAX_TIMEOUT: int = 60
REQUEST_TIMEOUT: int = 10
# In milliseconds
PRE_ACTION_TIMEOUT: int = 10 * 1_000


class PageElementAction(StrEnum):
    CLICK = "click"
    TO_BE_VISIBLE = "to_be_visible"


async def __download_via_request(
    page_url: str,
    *,
    user_agent: str | None = None,
) -> str | None:
    async with httpx.AsyncClient() as client:
        response: httpx.Response = await client.get(
            page_url,
            follow_redirects=True,
            timeout=REQUEST_TIMEOUT,
            headers=(
                {
                    "User-Agent": user_agent,
                } if user_agent
                else {}
            ),
        )

    if response.status_code == HTTPStatus.TOO_MANY_REQUESTS:
        # Returning None also triggers backoff retry in download_page_async if necessary
        return None

    response.raise_for_status()

    return response.text


async def __download_via_browser(
    page_url: str,
    *,
    user_agent: str | None = None,
    pre_actions: Iterable[tuple[str, PageElementAction]] | None = None,
) -> str | None:
    async with async_playwright() as playwright:
        browser: Browser = await playwright.chromium.launch()
        # Open the page from a context created with the desired user agent,
        # so that the user agent actually applies to the page.
        context = await browser.new_context(
            user_agent=user_agent,
        )

        page = await context.new_page()
        await page.route(
            "**/*",
            lambda route: (
                route.abort()
                # https://playwright.dev/python/docs/api/class-request#request-resource-type
                if route.request.resource_type in {
                    "font",
                    "image",
                    "media",
                }
                else route.continue_()
            ),
        )
        response = await page.goto(page_url)
        if not response:
            return None
        if response.status == HTTPStatus.TOO_MANY_REQUESTS:
            # Returning None also triggers backoff retry in download_page_async if necessary
            return None

        for selector, action in pre_actions or []:
            with suppress(AssertionError, PlaywrightTimeoutError):
                match action:
                    case PageElementAction.CLICK:
                        await page.locator(selector).click(
                            timeout=PRE_ACTION_TIMEOUT,
                            # Allow clicking even when the current element is covered
                            # by other elements. Otherwise, extra pre-actions would be
                            # needed before this one to dismiss the covering elements.
                            # Note that dismissing those covering elements may still be
                            # necessary for the page to function properly.
                            force=True,
                        )
                    case PageElementAction.TO_BE_VISIBLE:
                        await expect(page.locator(selector)).to_be_visible(
                            timeout=PRE_ACTION_TIMEOUT,
                        )

        html: str = await page.content()

        await browser.close()

    return html


# Retry with exponential backoff when the decorated function returns a falsy value
# (e.g., None on HTTP 429), up to MAX_TRIES attempts or MAX_TIMEOUT seconds.
@backoff.on_predicate(
    backoff.expo,
    max_tries=MAX_TRIES,
    max_time=MAX_TIMEOUT,
)
async def download_page_async(
    page_url: str,
    *,
    cleanup: bool = False,
    text_only: bool = False,
    user_agent: str | None = None,
    use_browser: bool = False,
    pre_actions: Iterable[tuple[str, PageElementAction]] | None = None,
) -> str | None:
    page_html: str | None
    if use_browser:
        page_html = await __download_via_browser(
            page_url,
            user_agent=user_agent,
            pre_actions=pre_actions,
        )
    else:
        page_html = await __download_via_request(
            page_url,
            user_agent=user_agent,
        )
    if page_html is None:
        return None

    if cleanup:
        page_html = await cleanup_page(page_html)

    if text_only:
        h = HTML2Text()
        h.ignore_images = True
        h.ignore_links = True
        return h.handle(page_html)

    return page_html


def download_page(
    page_url: str,
    **kwargs: Any,
) -> str | None:
    return asyncio.run(download_page_async(
        page_url,
        **kwargs,
    ))
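

Example usage (an illustrative sketch, not part of the module above; the URL and the CSS selectors are placeholder assumptions):

import asyncio

from extratools_html import PageElementAction, download_page, download_page_async

# Plain HTTP request, with HTML cleanup and text extraction.
text = download_page("https://example.com", cleanup=True, text_only=True)

# Browser-based fetch: click a (hypothetical) cookie-consent button, then wait
# for the main content to be visible before capturing the rendered HTML.
html = asyncio.run(download_page_async(
    "https://example.com",
    use_browser=True,
    pre_actions=[
        ("#accept-cookies", PageElementAction.CLICK),
        ("main", PageElementAction.TO_BE_VISIBLE),
    ],
))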