Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1import re 

2import os 

3import six 

4import json 

5import time 

6import shlex 

7import atexit 

8import psutil 

9import requests 

10import tornado.gen 

11from orderedattrdict import AttrDict 

12from threading import Thread, Lock 

13from subprocess import Popen, PIPE, STDOUT # nosec 

14from six.moves.urllib.parse import urlencode, urljoin 

15from tornado.web import HTTPError 

16from tornado.httpclient import AsyncHTTPClient 

17from gramex.config import app_log, variables, recursive_encode 

18from gramex.http import OK, BAD_REQUEST, GATEWAY_TIMEOUT, BAD_GATEWAY, CLIENT_TIMEOUT 

19from .basehandler import BaseHandler 

20 

# MIME type sent for PowerPoint (.pptx) captures
_PPTX_MIME = (
    'application/vnd.openxmlformats-officedocument.presentationml.presentation'
)

# Lower-case names of HTTP request headers that must NOT be forwarded to
# chromecapture.js. Keep this list in sync with the one in chromecapture.js.
_IGNORE_HEADERS = {
    # The URL will determine the host
    'host',
    # Let Tornado manage the connection ...
    'connection',
    # ... and the upgrades
    'upgrade',
    # The new request will have a different content-length ...
    'content-length',
    # ... and a different content-md5
    'content-md5',
}

31 

32 

class Capture(object):
    '''
    Create a proxy for capture.js. Typical usage::

        capture = Capture()
        with open('screenshot.png', 'wb') as handle:
            handle.write(capture.png('https://gramener.com/'))
        with open('screenshot.pdf', 'wb') as handle:
            handle.write(capture.pdf('https://gramener.com/'))

    The constructor accepts these optional parameters:

    :arg int port: port where capture.js is running. Default: 9900
    :arg string url: URL:port where PhantomJS is running with capture.js.
        Default: ``http://localhost:<port>/``
    :arg string engine: key in :attr:`Capture.engines` (``phantomjs`` or
        ``chrome``). Default: ``phantomjs``
    :arg string cmd: Command to run PhantomJS with capture.js at the specified
        port. Default: ``phantomjs $GRAMEXPATH/apps/capture/capture.js --port=<port>``
    :arg int timeout: Seconds to wait for PhantomJS to timeout. Default: 10

    The constructor runs :meth:`Capture.start` in a new thread, which checks if
    capture.js is running at ``url``. If not, it runs ``cmd`` and checks again.
    Until capture.js is detected, all capture methods will fail.
    '''
    default_port = 9900     # Default port to run CaptureJS at
    check_interval = 0.05   # Frequency (seconds) to check if self.started
    # Set engine configurations for PhantomJS and Puppeteer
    engines = AttrDict(
        phantomjs=AttrDict(
            cmd='phantomjs --ssl-protocol=any',
            script='capture.js',
            first_line=b'PhantomJS.*capture\\.js',
            name='Capture',
            version='1.0'
        ),
        chrome=AttrDict(
            cmd='node',
            script='chromecapture.js',
            first_line=b'node\\.js.*chromecapture\\.js',
            name='ChromeCapture',
            version='1.1'
        ),
    )

    def __init__(self, port=None, url=None, engine=None, cmd=None, timeout=10):
        '''
        Store the configuration and begin starting the capture server.

        Unspecified ``port``, ``url`` and ``cmd`` are derived from the selected
        engine. :meth:`start` is called immediately, so construction returns
        before the server is known to be running.
        '''
        # Set default values for port, url and cmd
        self.engine = self.engines['phantomjs' if engine is None else engine]
        port = self.default_port if port is None else port
        if url is None:
            url = 'http://localhost:%d/' % port
        if cmd is None:
            script = os.path.join(variables.GRAMEXPATH, 'apps', 'capture', self.engine.script)
            cmd = '%s "%s" --port=%d' % (self.engine.cmd, script, port)
        self.url = url
        self.first_line_re = re.compile(self.engine.first_line)
        self.cmd = cmd
        self.timeout = timeout
        self.browser = AsyncHTTPClient()
        self.lock = Lock()
        self.started = False
        self.start()

    def start(self):
        '''
        Starts a thread and check if capture is already running at ``url``. If
        not, start ``cmd`` and check again. Print logs from ``cmd``.

        This method is thread-safe. It may be called as often as required.
        :class:`CaptureHandler` calls this method if ``?start`` is passed.
        '''
        # The lock only serializes thread creation; _start itself runs unlocked
        with self.lock:
            thread = Thread(target=self._start)
            thread.daemon = True
            thread.start()

    def _start(self):
        '''
        Check if capture is already running at ``url``. If not, start ``cmd``
        and check again. Print logs from ``cmd``.

        Runs in a background thread, so every failure is logged instead of
        being raised.
        '''
        self.started = False
        script = self.engine.script
        try:
            # Check if capture.js is at the url specified
            app_log.info('Pinging %s at %s', script, self.url)
            r = requests.get(self.url, timeout=self.timeout)
            self._validate_server(r)
            self.started = True
        except requests.ReadTimeout:
            # If capture.js doesn't respond immediately, we haven't started
            app_log.error('url: %s timed out', self.url)
        except requests.ConnectionError:
            # Nothing is listening at the URL. Try starting the process.
            # An exception raised inside this handler is NOT caught by the
            # sibling "except Exception" below, so guard the launch explicitly.
            # Otherwise a missing binary would kill the thread without a log.
            try:
                self._launch(script)
            except Exception:
                app_log.exception('Cannot start Capture')
        except Exception:
            app_log.exception('Cannot start Capture')

    def _launch(self, script):
        '''
        Run ``self.cmd``, verify that it is the capture server, then relay its
        output to the log until it exits or is replaced.

        :arg str script: engine script name, used only in log messages
        '''
        app_log.info('Starting %s via %s', script, self.cmd)
        self.close()
        # self.cmd is taken from the YAML configuration. Safe to run
        proc = self.proc = Popen(shlex.split(self.cmd), stdout=PIPE, stderr=STDOUT)   # nosec
        proc.poll()
        atexit.register(self.close)
        # TODO: what if readline() does not return quickly?
        line = proc.stdout.readline().strip()
        if not self.first_line_re.search(line):
            return app_log.error('cmd: %s invalid. Returned "%s"', self.cmd, line)
        app_log.info('Pinging %s at %s', script, self.url)
        try:
            r = requests.get(self.url, timeout=self.timeout)
            self._validate_server(r)
            pid = proc.pid
            # Pass the line as an argument: it may itself contain '%' characters
            app_log.info('%s live (pid=%s)', line.decode('utf-8', 'replace'), pid)
            self.started = True
            # Keep logging capture.js output until proc is killed by another thread.
            # Read from the local reference so that close() (which deletes
            # self.proc) or a newer _start() cannot change the pipe under us.
            while getattr(self, 'proc', None) is proc:
                raw = proc.stdout.readline()
                # Only an empty read means EOF. A blank line is b'\n', not b''
                if not raw:
                    app_log.info('%s terminated: pid=%d', script, pid)
                    self.started = False
                    break
                line = raw.strip()
                if not line:
                    continue
                # Capture won't print anything, unless there's a problem, or if debug is on.
                # So log it at warning level not info.
                app_log.warning(line.decode('utf-8', 'replace'))
        except Exception:
            app_log.exception('Ran %s. But %s not at %s', self.cmd, script, self.url)

    def close(self):
        '''Stop capture.js if it has been started by this object'''
        if not hasattr(self, 'proc'):
            return
        pid = self.proc.pid
        try:
            process = psutil.Process(pid)
            # Kill the children first, then the process itself
            targets = process.children(recursive=True) + [process]
        except psutil.NoSuchProcess:
            targets = []
            app_log.info('%s PID %d already killed', self.engine.script, pid)
        for target in targets:
            # Handle each process separately: a child that has already exited
            # must not prevent the remaining processes from being killed
            try:
                target.kill()
            except psutil.NoSuchProcess:
                app_log.info('%s PID %d already killed', self.engine.script, target.pid)
        delattr(self, 'proc')

    @staticmethod
    def _version_key(version):
        '''Return '1.10' as the tuple (1, 10), or None if any part is not an integer'''
        try:
            return tuple(int(part) for part in version.split('.'))
        except ValueError:
            return None

    def _validate_server(self, response):
        '''
        Make sure that the response we got is from the right version of capture.js

        The ``Server`` header must look like ``<engine name>/<version>`` with a
        version that is at least the engine's required version.

        :raises RuntimeError: if the header is missing, malformed, names a
            different server, or reports an older version
        '''
        server = response.headers.get('Server', '')
        parts = server.split('/', 2)
        script = self.engine.script
        valid = len(parts) == 2 and parts[0] == self.engine.name
        if valid:
            found = self._version_key(parts[1])
            required = self._version_key(self.engine.version)
            if found is None or required is None:
                # Non-numeric version: fall back to plain string comparison
                valid = parts[1] >= self.engine.version
            else:
                # Compare numerically, so that 1.10 is newer than 1.9
                valid = found >= required
        if not valid:
            raise RuntimeError('Server: %s at %s is not %s' % (server, self.url, script))

    @tornado.gen.coroutine
    def capture_async(self, headers=None, **kwargs):
        '''
        Returns a screenshot of the URL. Runs asynchronously in Gramex. Arguments
        are same as :py:func:`capture`

        :arg dict headers: HTTP headers to send with the request to capture.js
        :return: the Tornado HTTP response from capture.js. Errors are not
            raised (``raise_error=False``); check ``response.code``
        :raises RuntimeError: if ``start`` was passed and the server did not
            start within ``timeout``, or if the server fails validation
        '''
        # If ?start is provided, start server and wait until timeout
        if 'start' in kwargs:
            self.start()
            end_time = time.time() + self.timeout
            while not self.started and time.time() < end_time:
                yield tornado.gen.sleep(self.check_interval)
            if not self.started:
                raise RuntimeError('%s not started. See logs' % self.engine.script)
        if six.PY2:
            recursive_encode(kwargs)
        r = yield self.browser.fetch(
            self.url, method='POST', body=urlencode(kwargs, doseq=True), raise_error=False,
            connect_timeout=self.timeout, request_timeout=self.timeout, headers=headers)
        if r.code == OK:
            self._validate_server(r)
        raise tornado.gen.Return(r)

    def capture(self, url, **kwargs):
        '''
        Return a screenshot of the URL.

        :arg str url: URL to take a screenshot of
        :arg str ext: format of output. Can be pdf, png, gif or jpg
        :arg str selector: Restrict screenshot to (optional) CSS selector in URL
        :arg int delay: milliseconds (or expression) to wait for before taking a screenshot
        :arg str format: A3, A4, A5, Legal, Letter or Tabloid. Defaults to A4. For PDF
        :arg str layout: A3, A4, A5, Legal, 16x9, 16x10, 4x3. Defaults to 4x3. For PPTX
        :arg str orientation: portrait or landscape. Defaults to portrait. For PDF
        :arg str header: header for the page. For PDF
        :arg str footer: footer for the page. For PDF
        :arg int width: screen width. Default: 1200. For PNG/GIF/JPG
        :arg int height: screen height. Default: 768. For PNG/GIF/JPG
        :arg float scale: zooms the screen by a factor. For PNG/GIF/JPG
        :arg int dpi: dots (pixels) per inch. For PPTX
        :arg str title: slide title. For PPTX
        :arg int debug: sets log level for HTTP requests (2) and responses (1)
        :return: a bytestring with the binary contents of the screenshot
        :rtype: bytes
        :raises RuntimeError: if capture.js is not running or fails
        '''
        # Ensure that we're connecting to the right version of capture.js
        if not self.started:
            # The server may still be starting up. Wait up to timeout seconds
            end_time = time.time() + self.timeout
            while not self.started and time.time() < end_time:
                time.sleep(self.check_interval)
            if not self.started:
                raise RuntimeError('%s not started. See logs' % self.engine.script)
        kwargs['url'] = url
        r = requests.post(self.url, data=kwargs, timeout=self.timeout)
        if r.status_code == OK:
            self._validate_server(r)
            return r.content
        else:
            raise RuntimeError('%s error: %s' % (self.engine.script, r.content))

    def pdf(self, url, **kwargs):
        '''An alias for :meth:`Capture.capture` with ``ext='pdf'``.'''
        kwargs['ext'] = 'pdf'
        return self.capture(url, **kwargs)

    def png(self, url, **kwargs):
        '''An alias for :meth:`Capture.capture` with ``ext='png'``.'''
        kwargs['ext'] = 'png'
        return self.capture(url, **kwargs)

    def pptx(self, url, **kwargs):
        '''An alias for :meth:`Capture.capture` with ``ext='pptx'``.'''
        kwargs['ext'] = 'pptx'
        return self.capture(url, **kwargs)

    def jpg(self, url, **kwargs):
        '''An alias for :meth:`Capture.capture` with ``ext='jpg'``.'''
        kwargs['ext'] = 'jpg'
        return self.capture(url, **kwargs)

    def gif(self, url, **kwargs):
        '''An alias for :meth:`Capture.capture` with ``ext='gif'``.'''
        kwargs['ext'] = 'gif'
        return self.capture(url, **kwargs)

263 

264 

class CaptureHandler(BaseHandler):
    '''
    Renders a web page as a PDF or as an image. It accepts the same arguments as
    :class:`Capture`.

    The page is called with the same args as :meth:`Capture.capture`. It also
    accepts a ``?start`` parameter that restarts capture.js if required.
    '''
    # Each config maps to a Capture() object. cls.captures[config] = Capture()
    captures = {}

    @classmethod
    def setup(cls, port=None, url=None, engine=None, cmd=None, **kwargs):
        '''
        Configure the handler and attach a :class:`Capture` instance to it.

        Capture instances are cached by their configuration, so handlers that
        share the same ``port``, ``url``, ``engine``, ``cmd`` and ``timeout``
        reuse one instance. All other ``kwargs`` go to the base handler's setup.
        '''
        super(CaptureHandler, cls).setup(**kwargs)
        capture_kwargs = {}
        for kwarg in ('timeout', ):
            if kwarg in kwargs:
                capture_kwargs[kwarg] = kwargs.pop(kwarg)
        # Create a new Capture only if the config has changed
        config = dict(engine=engine, port=port, url=url, cmd=cmd, **capture_kwargs)
        config_str = json.dumps(config, separators=[',', ':'], sort_keys=True)
        if config_str not in cls.captures:
            cls.captures[config_str] = cls.capture = Capture(**config)
        else:
            cls.capture = cls.captures[config_str]
        # TODO: if the old config is no longer used, close it
        # Supported output extensions and the Content-Type sent for each
        cls.ext = {
            'pdf': dict(mime='application/pdf'),
            'png': dict(mime='image/png'),
            'jpg': dict(mime='image/jpeg'),
            'jpeg': dict(mime='image/jpeg'),
            'gif': dict(mime='image/gif'),
            'pptx': dict(mime=_PPTX_MIME),
        }

    @tornado.gen.coroutine
    def get(self):
        '''
        Capture ``?url=`` (default: the Referer) and send it as a download.

        On success, the body is the captured file. If capture.js does not
        respond in time, the status is GATEWAY_TIMEOUT with a JSON message. Any
        other capture.js status code is passed through along with its body.

        :raises HTTPError: BAD_REQUEST if no URL is available, BAD_GATEWAY if
            the capture raised a RuntimeError
        '''
        args = self.argparse(
            url={'default': self.request.headers.get('Referer', None)},
            ext={'choices': self.ext, 'default': 'pdf'},
            file={'default': 'screenshot'},
            emulate={},
            selector={'nargs': '*'},
            cookie={},
            delay={},
            width={'type': int},
            height={'type': int},
            x={'type': int},
            y={'type': int},
            scale={'type': float},
            dpi={'type': int, 'nargs': '*'},
            format={'choices': ['A3', 'A4', 'A5', 'Legal', 'Letter', 'Tabloid'], 'default': 'A4'},
            layout={'choices': ['A3', 'A4', 'Letter', '16x9', '16x10', '4x3'], 'default': '4x3'},
            orientation={'choices': ['portrait', 'landscape'], 'default': 'portrait'},
            title={'nargs': '*'},
            title_size={'type': int, 'nargs': '*'},
            start={'nargs': '*'},
            debug={'nargs': '*'},
            header={},
            footer={},
            headerTemplate={},
            footerTemplate={},
            margins={},
        )
        if args['url'] is None:
            raise HTTPError(BAD_REQUEST, reason='%s: CaptureHandler needs ?url=' % self.name)

        # If the URL is a relative URL, treat it relative to the called path
        args['url'] = urljoin(self.request.full_url(), args['url'])
        # Copy all relevant HTTP headers as-is. Tornado yields normalized header
        # names (e.g. "Content-Length"), while _IGNORE_HEADERS is lower-case.
        # So compare the lower-cased name, else nothing is ever filtered out.
        args['headers'] = {
            key: val for key, val in self.request.headers.items()
            if key.lower() not in _IGNORE_HEADERS
        }
        if 'cookie' not in args:
            cookie = self.request.headers.get('Cookie', None)
            if cookie is not None:
                args['cookie'] = cookie
        info = self.ext[args.ext]
        try:
            response = yield self.capture.capture_async(**args)
        except RuntimeError as e:
            # capture.js could not fetch the response
            raise HTTPError(BAD_GATEWAY, reason=e.args[0])

        if response.code == OK:
            self.set_header('Content-Type', info['mime'])
            self.set_header('Content-Disposition',
                            'attachment; filename="{file}.{ext}"'.format(**args))
            self.write(response.body)
        elif response.code == CLIENT_TIMEOUT:
            self.set_status(GATEWAY_TIMEOUT, reason='Capture is busy')
            self.set_header('Content-Type', 'application/json')
            self.write({'status': 'fail', 'msg': [
                'Capture did not respond within timeout: %ds' % self.capture.timeout]})
        else:
            self.set_status(response.code, reason='capture.js error')
            self.set_header('Content-Type', 'application/json')
            self.write(response.body)