Coverage for src/extratools_html/__init__.py: 41%

68 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-05-12 00:00 -0700

1from __future__ import annotations 

2 

3import asyncio 

4from collections.abc import Iterable 

5from contextlib import suppress 

6from enum import StrEnum 

7from http import HTTPStatus 

8from typing import Any 

9 

10import backoff 

11import httpx 

12import truststore 

13from html2text import HTML2Text 

14 

15with suppress(ImportError): 

16 from playwright.async_api import Browser, async_playwright, expect 

17 from playwright.async_api import TimeoutError as PlaywrightTimeoutError 

18 

19from .cleanup import cleanup_page 

20 

truststore.inject_into_ssl()  # Use the OS certificate store for TLS verification.

# Backoff configuration for `download_page_async`: at most MAX_TRIES attempts,
# within MAX_TIMEOUT seconds total.
MAX_TRIES: int = 3
MAX_TIMEOUT: int = 60
# Per-request timeout for the plain-HTTP download path, in seconds.
REQUEST_TIMEOUT: int = 10
# In milliseconds
PRE_ACTION_TIMEOUT: int = 10 * 1_000

28 

29 

30class PageElementAction(StrEnum): 

31 CLICK = "click" 

32 TO_BE_VISIBLE = "to_be_visible" 

33 

34 

async def __download_via_request(
    page_url: str,
    *,
    user_agent: str | None = None,
) -> str | None:
    """Fetch ``page_url`` over plain HTTP and return the response body.

    Returns ``None`` on HTTP 429 (a falsy result lets the ``backoff``
    decorator on the caller retry); any other error status raises via
    ``raise_for_status``.
    """
    headers: dict[str, str] = {}
    if user_agent:
        headers["User-Agent"] = user_agent

    async with httpx.AsyncClient() as client:
        response: httpx.Response = await client.get(
            page_url,
            follow_redirects=True,
            timeout=REQUEST_TIMEOUT,
            headers=headers,
        )

    if response.status_code == HTTPStatus.TOO_MANY_REQUESTS:
        # It also triggers backoff if necessary
        return None

    response.raise_for_status()
    return response.text

60 

61 

62async def __download_via_browser( 

63 page_url: str, 

64 *, 

65 user_agent: str | None = None, 

66 pre_actions: Iterable[tuple[str, PageElementAction]] | None = None, 

67) -> str | None: 

68 async with async_playwright() as playwright: 

69 browser: Browser = await playwright.chromium.launch() 

70 await browser.new_context( 

71 user_agent=user_agent, 

72 ) 

73 

74 page = await browser.new_page() 

75 await page.route( 

76 "**/*", 

77 lambda route: ( 

78 route.abort() 

79 # https://playwright.dev/python/docs/api/class-request#request-resource-type 

80 if route.request.resource_type in { 

81 "font", 

82 "image", 

83 "media", 

84 } 

85 else route.continue_() 

86 ), 

87 ) 

88 response = await page.goto(page_url) 

89 if not response: 

90 return None 

91 if response.status == HTTPStatus.TOO_MANY_REQUESTS: 

92 # It also triggers backoff if necessary 

93 return None 

94 

95 for selector, action in pre_actions or []: 

96 with suppress(AssertionError, PlaywrightTimeoutError): 

97 match action: 

98 case PageElementAction.CLICK: 

99 await page.locator(selector).click( 

100 timeout=PRE_ACTION_TIMEOUT, 

101 # Allow click even current element is covered by other elements. 

102 # Otherwise, other pre-actions are needed before this pre-action 

103 # to dismiss those covering elements. 

104 # However, it is possible that dismissing those covering elements 

105 # is necessary logic for page to function properly. 

106 force=True, 

107 ) 

108 case PageElementAction.TO_BE_VISIBLE: 

109 await expect(page.locator(selector)).to_be_visible( 

110 timeout=PRE_ACTION_TIMEOUT, 

111 ) 

112 

113 html: str = await page.content() 

114 

115 await browser.close() 

116 

117 return html 

118 

119 

@backoff.on_predicate(
    backoff.expo,
    max_tries=MAX_TRIES,
    max_time=MAX_TIMEOUT,
)
async def download_page_async(
    page_url: str,
    *,
    cleanup: bool = False,
    text_only: bool = False,
    user_agent: str | None = None,
    use_browser: bool = False,
    pre_actions: Iterable[tuple[str, PageElementAction]] | None = None,
) -> str | None:
    """Download a page, optionally cleaning it up or reducing it to text.

    A ``None`` result from the download helpers (e.g. on HTTP 429)
    triggers exponential-backoff retries via ``backoff.on_predicate``.

    ``use_browser`` selects headless-Chromium rendering (required for
    ``pre_actions``); otherwise a plain HTTP request is made.
    """
    page_html: str | None
    if use_browser:
        page_html = await __download_via_browser(
            page_url,
            user_agent=user_agent,
            pre_actions=pre_actions,
        )
    else:
        page_html = await __download_via_request(
            page_url,
            user_agent=user_agent,
        )

    if page_html is None:
        return None

    if cleanup:
        page_html = await cleanup_page(page_html)

    if not text_only:
        return page_html

    converter = HTML2Text()
    converter.ignore_images = True
    converter.ignore_links = True
    return converter.handle(page_html)

159 

160 

def download_page(
    image_url: str,
    **kwargs: Any,
) -> str | None:
    """Synchronous wrapper around :func:`download_page_async`.

    NOTE(review): the first parameter is named ``image_url`` but it is
    forwarded as a page URL -- presumably a copy/paste leftover. Kept
    as-is because renaming would break keyword callers; confirm before
    changing.
    """
    return asyncio.run(
        download_page_async(image_url, **kwargs),
    )