Coverage for gramex\handlers\capturehandler.py : 48%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import re
2import os
3import six
4import json
5import time
6import shlex
7import atexit
8import psutil
9import requests
10import tornado.gen
11from orderedattrdict import AttrDict
12from threading import Thread, Lock
13from subprocess import Popen, PIPE, STDOUT # nosec
14from six.moves.urllib.parse import urlencode, urljoin
15from tornado.web import HTTPError
16from tornado.httpclient import AsyncHTTPClient
17from gramex.config import app_log, variables, recursive_encode
18from gramex.http import OK, BAD_REQUEST, GATEWAY_TIMEOUT, BAD_GATEWAY, CLIENT_TIMEOUT
19from .basehandler import BaseHandler
21_PPTX_MIME = 'application/vnd.openxmlformats-officedocument.presentationml.presentation'
22# HTTP headers not to forward to chromecapture.js.
23# Keep this sync-ed with the same list in chromecapture.js
24_IGNORE_HEADERS = {
25 'host', # The URL will determine the host
26 'connection', # Let Tornado manage the connection
27 'upgrade', # .. and the upgrades
28 'content-length', # The new request will have a different content - length
29 'content-md5', # ... and different content - md5
30}
class Capture(object):
    '''
    Create a proxy for capture.js. Typical usage::

        capture = Capture()
        with open('screenshot.png', 'wb') as handle:
            handle.write(capture.png('https://gramener.com/'))
        with open('screenshot.pdf', 'wb') as handle:
            handle.write(capture.pdf('https://gramener.com/'))

    The constructor accepts these optional parameters:

    :arg int port: port where capture.js is running. Default: 9900
    :arg string url: URL:port where the capture script is running.
        Default: ``http://localhost:<port>/``
    :arg string engine: key into :attr:`Capture.engines` (``phantomjs`` or
        ``chrome``). Default: ``phantomjs``
    :arg string cmd: Command to run the capture script at the specified port.
        Default: ``<engine cmd> "$GRAMEXPATH/apps/capture/<engine script>" --port=<port>``
    :arg int timeout: Seconds to wait for the capture script to respond. Default: 10

    The constructor runs :meth:`Capture.start` in a new thread, which checks if
    capture.js is running at ``url``. If not, it runs ``cmd`` and checks again.
    Until capture.js is detected, all capture methods will fail.
    '''
    default_port = 9900         # Default port to run CaptureJS at
    check_interval = 0.05       # Frequency (seconds) to check if self.started
    # Set engine configurations for PhantomJS and Puppeteer.
    #   cmd: executable (plus flags) used to launch the script
    #   script: file name under $GRAMEXPATH/apps/capture/
    #   first_line: regex (bytes) the first line of the process output must match
    #   name, version: expected ``Server: <name>/<version>`` response header
    engines = AttrDict(
        phantomjs=AttrDict(
            cmd='phantomjs --ssl-protocol=any',
            script='capture.js',
            first_line=b'PhantomJS.*capture\\.js',
            name='Capture',
            version='1.0'
        ),
        chrome=AttrDict(
            cmd='node',
            script='chromecapture.js',
            first_line=b'node\\.js.*chromecapture\\.js',
            name='ChromeCapture',
            version='1.1'
        ),
    )

    def __init__(self, port=None, url=None, engine=None, cmd=None, timeout=10):
        # Set default values for port, url and cmd
        self.engine = self.engines['phantomjs' if engine is None else engine]
        port = self.default_port if port is None else port
        if url is None:
            url = 'http://localhost:%d/' % port
        if cmd is None:
            script = os.path.join(variables.GRAMEXPATH, 'apps', 'capture', self.engine.script)
            cmd = '%s "%s" --port=%d' % (self.engine.cmd, script, port)
        self.url = url
        self.first_line_re = re.compile(self.engine.first_line)
        self.cmd = cmd
        self.timeout = timeout
        self.browser = AsyncHTTPClient()
        self.lock = Lock()
        # self.started is flipped to True by _start() once the server is validated
        self.started = False
        self.start()

    def start(self):
        '''
        Starts a thread and check if capture is already running at ``url``. If
        not, start ``cmd`` and check again. Print logs from ``cmd``.

        This method is thread-safe. It may be called as often as required.
        :class:`CaptureHandler` calls this method if ``?start`` is passed.
        '''
        with self.lock:
            thread = Thread(target=self._start)
            # Daemon thread: it must not keep the interpreter alive on exit
            thread.daemon = True
            thread.start()

    def _start(self):
        '''
        Check if capture is already running at ``url``. If not, start ``cmd``
        and check again. Print logs from ``cmd``.

        Sets ``self.started`` to True once a valid server responds. When this
        method launches the process itself, it keeps running (in its thread)
        to relay the process output to the log until the process ends.
        '''
        self.started = False
        script = self.engine.script
        try:
            # Check if capture.js is at the url specified
            app_log.info('Pinging %s at %s', script, self.url)
            r = requests.get(self.url, timeout=self.timeout)
            self._validate_server(r)
            self.started = True
        except requests.ReadTimeout:
            # If capture.js doesn't respond immediately, we haven't started
            app_log.error('url: %s timed out', self.url)
        except requests.ConnectionError:
            # Nothing is listening at the URL. Try starting the process again
            app_log.info('Starting %s via %s', script, self.cmd)
            self.close()
            # self.cmd is taken from the YAML configuration. Safe to run
            self.proc = Popen(shlex.split(self.cmd), stdout=PIPE, stderr=STDOUT)    # nosec
            self.proc.poll()
            atexit.register(self.close)
            # TODO: what if readline() does not return quickly?
            line = self.proc.stdout.readline().strip()
            if not self.first_line_re.search(line):
                return app_log.error('cmd: %s invalid. Returned "%s"', self.cmd, line)
            app_log.info('Pinging %s at %s', script, self.url)
            try:
                r = requests.get(self.url, timeout=self.timeout)
                self._validate_server(r)
                pid = self.proc.pid
                app_log.info(line.decode('utf-8') + ' live (pid=%s)', pid)
                self.started = True
                # Keep logging capture.js output until proc is killed by another thread
                while hasattr(self, 'proc'):
                    line = self.proc.stdout.readline().strip()
                    # An empty read means the process closed its output, i.e. it ended
                    if not line:
                        app_log.info('%s terminated: pid=%d', script, pid)
                        self.started = False
                        break
                    # Capture won't print anything, unless there's a problem, or if debug is on.
                    # So log it at warning level not info.
                    app_log.warning(line.decode('utf-8'))
            except Exception:
                app_log.exception('Ran %s. But %s not at %s', self.cmd, script, self.url)
        except Exception:
            app_log.exception('Cannot start Capture')

    def close(self):
        '''Stop capture.js if it has been started by this object'''
        if hasattr(self, 'proc'):
            try:
                process = psutil.Process(self.proc.pid)
                # Kill the whole process tree: children first, then the parent
                for proc in process.children(recursive=True):
                    proc.kill()
                process.kill()
            except psutil.NoSuchProcess:
                app_log.info('%s PID %d already killed', self.engine.script, self.proc.pid)
            delattr(self, 'proc')

    @staticmethod
    def _version_tuple(version):
        '''
        Convert a version string into a tuple of ints, e.g. ``'1.10'`` becomes
        ``(1, 10)``, so that versions compare numerically rather than as text.
        Non-digit characters act as separators. A string with no digits gives ``()``.
        '''
        return tuple(int(part) for part in re.findall(r'\d+', version))

    def _validate_server(self, response):
        '''
        Make sure that the response we got is from the right version of capture.js.

        :arg response: any response object with a dict-like ``.headers``
        :raises RuntimeError: if the ``Server`` header is not ``<name>/<version>``
            with the engine's name and a version at least the engine's version
        '''
        server = response.headers.get('Server', '')
        parts = server.split('/', 2)
        script = self.engine.script
        # Compare versions numerically. A plain string comparison would treat
        # '1.10' as older than '1.9' and reject a newer server.
        valid = (
            len(parts) == 2 and
            parts[0] == self.engine.name and
            self._version_tuple(parts[1]) >= self._version_tuple(self.engine.version))
        if not valid:
            raise RuntimeError('Server: %s at %s is not %s' % (server, self.url, script))

    @tornado.gen.coroutine
    def capture_async(self, headers=None, **kwargs):
        '''
        Returns a screenshot of the URL. Runs asynchronously in Gramex. Arguments
        are same as :py:func:`capture`

        :arg dict headers: HTTP headers to send along with the request to capture.js
        :return: the HTTP response from capture.js (whatever its status code)
        :raises RuntimeError: if capture.js has not started, or the response
            comes from an unexpected server
        '''
        # If ?start is provided, start server and wait until timeout
        if 'start' in kwargs:
            self.start()
            end_time = time.time() + self.timeout
            while not self.started and time.time() < end_time:
                yield tornado.gen.sleep(self.check_interval)
        if not self.started:
            raise RuntimeError('%s not started. See logs' % self.engine.script)
        if six.PY2:
            recursive_encode(kwargs)
        # raise_error=False: non-200 responses are returned to the caller, not raised
        r = yield self.browser.fetch(
            self.url, method='POST', body=urlencode(kwargs, doseq=True), raise_error=False,
            connect_timeout=self.timeout, request_timeout=self.timeout, headers=headers)
        if r.code == OK:
            self._validate_server(r)
        raise tornado.gen.Return(r)

    def capture(self, url, **kwargs):
        '''
        Return a screenshot of the URL.

        :arg str url: URL to take a screenshot of
        :arg str ext: format of output. Can be pdf, png, gif or jpg
        :arg str selector: Restrict screenshot to (optional) CSS selector in URL
        :arg int delay: milliseconds (or expression) to wait for before taking a screenshot
        :arg str format: A3, A4, A5, Legal, Letter or Tabloid. Defaults to A4. For PDF
        :arg str layout: A3, A4, A5, Legal, 16x9, 16x10, 4x3. Defaults to 4x3. For PPTX
        :arg str orientation: portrait or landscape. Defaults to portrait. For PDF
        :arg str header: header for the page. For PDF
        :arg str footer: footer for the page. For PDF
        :arg int width: screen width. Default: 1200. For PNG/GIF/JPG
        :arg int height: screen height. Default: 768. For PNG/GIF/JPG
        :arg float scale: zooms the screen by a factor. For PNG/GIF/JPG
        :arg int dpi: dots (pixels) per inch. For PPTX
        :arg str title: slide title. For PPTX
        :arg int debug: sets log level for HTTP requests (2) and responses (1)
        :return: a bytestring with the binary contents of the screenshot
        :rtype: bytes
        :raises RuntimeError: if capture.js is not running or fails
        '''
        # Ensure that we're connecting to the right version of capture.js.
        # Wait up to self.timeout seconds for the background thread to start it.
        if not self.started:
            end_time = time.time() + self.timeout
            while not self.started and time.time() < end_time:
                time.sleep(self.check_interval)
            if not self.started:
                raise RuntimeError('%s not started. See logs' % self.engine.script)
        kwargs['url'] = url
        r = requests.post(self.url, data=kwargs, timeout=self.timeout)
        if r.status_code == OK:
            self._validate_server(r)
            return r.content
        else:
            raise RuntimeError('%s error: %s' % (self.engine.script, r.content))

    def pdf(self, url, **kwargs):
        '''An alias for :meth:`Capture.capture` with ``ext='pdf'``.'''
        kwargs['ext'] = 'pdf'
        return self.capture(url, **kwargs)

    def png(self, url, **kwargs):
        '''An alias for :meth:`Capture.capture` with ``ext='png'``.'''
        kwargs['ext'] = 'png'
        return self.capture(url, **kwargs)

    def pptx(self, url, **kwargs):
        '''An alias for :meth:`Capture.capture` with ``ext='pptx'``.'''
        kwargs['ext'] = 'pptx'
        return self.capture(url, **kwargs)

    def jpg(self, url, **kwargs):
        '''An alias for :meth:`Capture.capture` with ``ext='jpg'``.'''
        kwargs['ext'] = 'jpg'
        return self.capture(url, **kwargs)

    def gif(self, url, **kwargs):
        '''An alias for :meth:`Capture.capture` with ``ext='gif'``.'''
        kwargs['ext'] = 'gif'
        return self.capture(url, **kwargs)
class CaptureHandler(BaseHandler):
    '''
    Renders a web page as a PDF or as an image. It accepts the same arguments as
    :class:`Capture`.

    The page is called with the same args as :meth:`Capture.capture`. It also
    accepts a ``?start`` parameter that restarts capture.js if required.
    '''
    # Each config maps to a Capture() object. cls.captures[config] = Capture()
    captures = {}

    @classmethod
    def setup(cls, port=None, url=None, engine=None, cmd=None, **kwargs):
        '''
        Configure the handler class. ``port``, ``url``, ``engine``, ``cmd`` and
        ``timeout`` (if present in ``kwargs``) are passed to :class:`Capture`.
        A :class:`Capture` instance is re-used across handlers with the same
        configuration. All ``kwargs`` are passed on to the parent ``setup``.
        '''
        super(CaptureHandler, cls).setup(**kwargs)
        capture_kwargs = {}
        for kwarg in ('timeout', ):
            if kwarg in kwargs:
                capture_kwargs[kwarg] = kwargs.pop(kwarg)
        # Create a new Capture only if the config has changed.
        # The sorted, compact JSON string acts as a canonical key for the config
        config = dict(engine=engine, port=port, url=url, cmd=cmd, **capture_kwargs)
        config_str = json.dumps(config, separators=[',', ':'], sort_keys=True)
        if config_str not in cls.captures:
            cls.captures[config_str] = cls.capture = Capture(**config)
        else:
            cls.capture = cls.captures[config_str]
            # TODO: if the old config is no longer used, close it
        # Supported ?ext= values and the Content-Type served for each
        cls.ext = {
            'pdf': dict(mime='application/pdf'),
            'png': dict(mime='image/png'),
            'jpg': dict(mime='image/jpeg'),
            'jpeg': dict(mime='image/jpeg'),
            'gif': dict(mime='image/gif'),
            'pptx': dict(mime=_PPTX_MIME),
        }

    @tornado.gen.coroutine
    def get(self):
        '''
        Capture ``?url=`` (default: the ``Referer`` header) and serve the result
        as a download. Responds with HTTP 400 if no URL is available, 502 if
        capture.js cannot be reached, and 504 if capture.js timed out.
        '''
        args = self.argparse(
            url={'default': self.request.headers.get('Referer', None)},
            ext={'choices': self.ext, 'default': 'pdf'},
            file={'default': 'screenshot'},
            emulate={},
            selector={'nargs': '*'},
            cookie={},
            delay={},
            width={'type': int},
            height={'type': int},
            x={'type': int},
            y={'type': int},
            scale={'type': float},
            dpi={'type': int, 'nargs': '*'},
            format={'choices': ['A3', 'A4', 'A5', 'Legal', 'Letter', 'Tabloid'], 'default': 'A4'},
            layout={'choices': ['A3', 'A4', 'Letter', '16x9', '16x10', '4x3'], 'default': '4x3'},
            orientation={'choices': ['portrait', 'landscape'], 'default': 'portrait'},
            title={'nargs': '*'},
            title_size={'type': int, 'nargs': '*'},
            start={'nargs': '*'},
            debug={'nargs': '*'},
            header={},
            footer={},
            headerTemplate={},
            footerTemplate={},
            margins={},
        )
        if args['url'] is None:
            raise HTTPError(BAD_REQUEST, reason='%s: CaptureHandler needs ?url=' % self.name)

        # If the URL is a relative URL, treat it relative to the called path
        args['url'] = urljoin(self.request.full_url(), args['url'])
        # Copy all relevant HTTP headers as-is. Header names are case-insensitive
        # and Tornado reports them in normalized case (e.g. ``Content-Length``),
        # while _IGNORE_HEADERS holds lowercase names. So compare in lowercase.
        args['headers'] = {
            key: val for key, val in self.request.headers.items()
            if key.lower() not in _IGNORE_HEADERS
        }
        # Unless ?cookie= is given, pass the request's own cookies
        if 'cookie' not in args:
            cookie = self.request.headers.get('Cookie', None)
            if cookie is not None:
                args['cookie'] = cookie
        info = self.ext[args.ext]
        try:
            response = yield self.capture.capture_async(**args)
        except RuntimeError as e:
            # capture.js could not fetch the response
            raise HTTPError(BAD_GATEWAY, reason=e.args[0])

        if response.code == OK:
            self.set_header('Content-Type', info['mime'])
            self.set_header('Content-Disposition',
                            'attachment; filename="{file}.{ext}"'.format(**args))
            self.write(response.body)
        elif response.code == CLIENT_TIMEOUT:
            self.set_status(GATEWAY_TIMEOUT, reason='Capture is busy')
            self.set_header('Content-Type', 'application/json')
            self.write({'status': 'fail', 'msg': [
                'Capture did not respond within timeout: %ds' % self.capture.timeout]})
        else:
            # Relay any other status and body from capture.js as-is
            self.set_status(response.code, reason='capture.js error')
            self.set_header('Content-Type', 'application/json')
            self.write(response.body)