Coverage for gramex\cache.py: 80%

'''Caching utilities'''
import io
import os
import re
import six
import sys
import json
import time
import atexit
import inspect
import requests
import tempfile
import mimetypes
import subprocess       # nosec
import pandas as pd
import tornado.template
from threading import Thread
from six.moves.queue import Queue
from orderedattrdict import AttrDict
from tornado.concurrent import Future
from tornado.ioloop import IOLoop, PeriodicCallback
from gramex.config import app_log, merge, used_kwargs, CustomJSONDecoder, CustomJSONEncoder
from six.moves.urllib_parse import urlparse

MILLISECOND = 0.001     # in seconds
_opener_defaults = dict(mode='r', buffering=-1, encoding='utf-8', errors='strict',
                        newline=None, closefd=True)
_markdown_defaults = dict(output_format='html5', extensions=[
    'markdown.extensions.codehilite',
    'markdown.extensions.extra',
    'markdown.extensions.toc',
    'markdown.extensions.meta',
    'markdown.extensions.sane_lists',
    'markdown.extensions.smarty',
])
# A set of temporary files to delete on program exit
_TEMP_FILES = set()
_ID_CACHE = set()


def _delete_temp_files():
    for path in _TEMP_FILES:
        if os.path.exists(path):
            os.remove(path)


atexit.register(_delete_temp_files)


def hashfn(fn):
    '''Returns a unique hash value for the function.'''
    # id() returns a unique value for the lifetime of an object.
    # To ensure the id is not recycled, cache the object so it's never released.
    _ID_CACHE.add(fn)
    return id(fn)


def cache_key(*args):
    '''Converts arguments into a string suitable for use as a cache key'''
    return json.dumps(args, sort_keys=True, separators=(',', ':'))
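

# A quick illustration of cache_key() (a hedged sketch; the values are arbitrary):
# identical arguments always serialize to the same compact JSON string, so the
# result is usable as a dictionary key.
#
#     cache_key('ls -la', {'buffer_size': 'line'})
#     # => '["ls -la",{"buffer_size":"line"}]'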


def opener(callback, read=False, **open_kwargs):
    '''
    Converts any function that accepts a string or handle as its parameter into
    a function that takes a file path as its first parameter.

    Here are a few examples::

        jsonload = opener(json.load)
        jsonload('x.json')      # opens x.json and runs json.load(handle)
        gramex.cache.open('x.json', jsonload)   # Loads x.json, cached

        # The read=True parameter passes the contents (not the handle) to the function
        template = opener(string.Template, read=True)
        template('abc.txt').substitute(x=val)
        gramex.cache.open('abc.txt', template).substitute(x=val)

        # If read=True, callback may be None. The result of .read() is passed as-is
        text = opener(None, read=True)
        gramex.cache.open('abc.txt', text)

    Keyword arguments applicable for ``io.open`` are passed to ``io.open``. These
    default to ``io.open(mode='r', buffering=-1, encoding='utf-8',
    errors='strict', newline=None, closefd=True)``. All other arguments and
    keyword arguments are passed to the callback (e.g. to ``json.load``).

    When reading binary files, pass ``mode='rb', encoding=None, errors=None``.
    '''
    merge(open_kwargs, _opener_defaults, 'setdefault')
    if read:
        # Pass contents to callback
        def method(path, **kwargs):
            open_args = {key: kwargs.pop(key, val) for key, val in open_kwargs.items()}
            with io.open(path, **open_args) as handle:
                result = handle.read()
                return callback(result, **kwargs) if callable(callback) else result
    else:
        if not callable(callback):
            raise ValueError('opener callback %r is not a function' % callback)

        # Pass handle to callback
        def method(path, **kwargs):
            open_args = {key: kwargs.pop(key, val) for key, val in open_kwargs.items()}
            with io.open(path, **open_args) as handle:
                return callback(handle, **kwargs)
    return method
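

# Example: building a cached CSV-dict loader with opener(). A minimal sketch;
# csv is from the standard library, and 'data.csv' is an illustrative file name.
#
#     import csv
#     csvload = opener(lambda handle: list(csv.DictReader(handle)))
#     rows = csvload('data.csv')                      # Opens data.csv, parses rows
#     rows = gramex.cache.open('data.csv', csvload)   # Same, but cached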


@opener
def _markdown(handle, **kwargs):
    from markdown import markdown
    return markdown(handle.read(), **{k: kwargs.pop(k, v) for k, v in _markdown_defaults.items()})


@opener
def _yaml(handle, **kwargs):
    import yaml
    defaults = {'Loader': yaml.FullLoader}
    return yaml.load(handle.read(), **{k: kwargs.pop(k, v) for k, v in defaults.items()})


def _template(path, **kwargs):
    root, name = os.path.split(path)
    return tornado.template.Loader(root, **kwargs).load(name)


def stat(path):
    '''
    Returns a file status tuple - based on file last modified time and file size
    '''
    if os.path.exists(path):
        stat = os.stat(path)
        return (stat.st_mtime, stat.st_size)
    return (None, None)
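

# Example: stat() as a cheap change detector (a sketch; '/tmp/x.txt' is
# illustrative). gramex.cache.open() reloads a file only when this tuple changes.
#
#     before = stat('/tmp/x.txt')     # e.g. (1588752345.0, 120)
#     # ... someone edits /tmp/x.txt ...
#     if stat('/tmp/x.txt') != before:
#         pass                        # reload the file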


def hashed(val):
    '''Return the hashed value of val. If not possible, return None'''
    try:
        hash(val)
        return val
    except TypeError:
        try:
            return json.dumps(val, sort_keys=True, separators=(',', ':'))
        except Exception:
            return None
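

# Example: hashed() falls back to a canonical JSON string for unhashable values
# (a sketch with arbitrary values):
#
#     hashed(3)                   # => 3 (already hashable)
#     hashed({'y': 2, 'x': 1})    # => '{"x":1,"y":2}' (sorted JSON string)
#     hashed({1, 2})              # => None (neither hashable nor JSON-serializable)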


# gramex.cache.open() stores its cache here.
# {(path, callback): {data: ..., stat: ...}}
_OPEN_CACHE = {}
_OPEN_CALLBACKS = dict(
    bin=opener(None, read=True, mode='rb', encoding=None, errors=None),
    txt=opener(None, read=True),
    text=opener(None, read=True),
    csv=pd.read_csv,
    excel=pd.read_excel,
    xls=pd.read_excel,
    xlsx=pd.read_excel,
    hdf=pd.read_hdf,
    h5=pd.read_hdf,
    html=pd.read_html,
    jsondata=pd.read_json,
    sas=pd.read_sas,
    stata=pd.read_stata,
    table=pd.read_table,
    parquet=pd.read_parquet,
    feather=pd.read_feather,
    md=_markdown,
    markdown=_markdown,
    tmpl=_template,
    template=_template,
    yml=_yaml,
    yaml=_yaml,
)


def open(path, callback=None, transform=None, rel=False, **kwargs):
    '''
    Reads a file, processes it via a callback, caches the result and returns it.
    When called again, returns the cached result unless the file has updated.

    By default, it determines the file type using the extension. For example::

        open('data.yaml')       # Loads a YAML file
        open('data.csv')        # Loads a CSV file

    The 2nd parameter (callback) accepts a predefined string that can be one of:

    - ``bin``: reads binary files using io.open
    - ``text`` or ``txt``: reads text files using io.open
    - ``yaml``: reads files using yaml.load via io.open
    - ``config``: reads files using :py:class:`gramex.config.PathConfig`.
      Same as ``yaml``, but allows ``import:`` and variable substitution.
    - ``json``: reads files using json.load via io.open
    - ``jsondata``: reads files using pd.read_json
    - ``template``: reads files using tornado.Template via io.open
    - ``markdown`` or ``md``: reads files using markdown.markdown via io.open
    - ``csv``, ``excel``, ``xls``, ``xlsx``, ``hdf``, ``h5``, ``html``, ``sas``,
      ``stata``, ``table``, ``parquet``, ``feather``: reads using Pandas
    - ``xml``, ``svg``, ``rss``, ``atom``: reads using lxml.etree

    For example::

        # Load data.yaml as YAML into an AttrDict
        open('data.yaml', 'yaml')

        # Load data.json as JSON into an AttrDict
        open('data.json', 'json', object_pairs_hook=AttrDict)

        # Load data.csv as CSV into a Pandas DataFrame
        open('data.csv', 'csv', encoding='cp1252')

    It can also be a function that accepts the filename and any other arguments::

        # Load data using a custom callback
        open('data.fmt', my_format_reader_function, arg='value')

    This is called as ``my_format_reader_function('data.fmt', arg='value')`` and
    cached. Future calls do not re-load and re-calculate this data.

    ``transform=`` is an optional function that processes the data returned by
    the callback. For example::

        # Returns the row count of the CSV file, updating it only when changed
        open('data.csv', 'csv', transform=lambda data: len(data))

        # After loading data.xlsx into a DataFrame, return the grouped result
        open('data.xlsx', 'xlsx', transform=lambda data: data.groupby('city')['sales'].sum())

    If ``transform=`` is not a callable, it is ignored.

    ``rel=True`` opens the path relative to the caller function's file path. If
    ``D:/app/calc.py`` calls ``open('data.csv', 'csv', rel=True)``, the path
    is replaced with ``D:/app/data.csv``.

    Any other keyword arguments are passed directly to the callback. If the
    callback is a predefined string and uses io.open, all arguments applicable to
    io.open are passed to io.open and the rest are passed to the callback.
    '''
    # Pass _reload_status = True for testing purposes. This returns a tuple:
    # (result, reloaded) instead of just the result.
    _reload_status = kwargs.pop('_reload_status', False)
    reloaded = False
    _cache = kwargs.pop('_cache', _OPEN_CACHE)

    # Get the parent frame's filename. Compute path relative to that.
    if rel:
        stack = inspect.getouterframes(inspect.currentframe(), 2)
        folder = os.path.dirname(os.path.abspath(stack[1][1]))
        path = os.path.join(folder, path)

    original_callback = callback
    if callback is None:
        callback = os.path.splitext(path)[-1][1:]
    callback_is_str = isinstance(callback, six.string_types)
    key = (
        path,
        original_callback if callback_is_str else id(callback),
        hashfn(transform),
        frozenset(((k, hashed(v)) for k, v in kwargs.items())),
    )
    cached = _cache.get(key, None)
    fstat = stat(path)
    if cached is None or fstat != cached.get('stat'):
        reloaded = True
        if callable(callback):
            data = callback(path, **kwargs)
        elif callback_is_str:
            method = None
            if callback in _OPEN_CALLBACKS:
                method = _OPEN_CALLBACKS[callback]
            elif callback in {'json'}:
                import json
                method = opener(json.load)
            elif callback in {'config'}:
                from gramex.config import PathConfig
                method = PathConfig
            elif callback in {'xml', 'svg', 'rss', 'atom'}:
                from lxml import etree
                method = etree.parse

            if method is not None:
                data = method(path, **kwargs)
            elif original_callback is None:
                raise TypeError('gramex.cache.open: path "%s" has unknown extension' % path)
            else:
                raise TypeError('gramex.cache.open(callback="%s") is not a known type' % callback)
        else:
            raise TypeError('gramex.cache.open(callback=) must be a function, not %r' % callback)
        if callable(transform):
            data = transform(data)
        _cache[key] = {'data': data, 'stat': fstat}

    result = _cache[key]['data']
    return (result, reloaded) if _reload_status else result
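

# Example: typical gramex.cache.open() usage (a sketch; 'data.csv' and the
# 'city' / 'sales' columns are illustrative). The callback and transform re-run
# only when the file's stat() changes.
#
#     totals = gramex.cache.open(
#         'data.csv', 'csv',
#         transform=lambda df: df.groupby('city')['sales'].sum())
#     # For testing: _reload_status=True also reports whether a reload happened
#     totals, reloaded = gramex.cache.open(
#         'data.csv', 'csv', _reload_status=True,
#         transform=lambda df: df.groupby('city')['sales'].sum())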


def set_cache(cache, old_cache):
    '''
    Use ``cache`` as the new cache for all open requests.
    Copies keys from the old cache, and deletes them from the old cache.
    '''
    for key in list(old_cache.keys()):
        cache[key] = old_cache[key]
        del old_cache[key]
    return cache
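

# Example: swapping the default cache for a size-bounded LRU cache. A hedged
# sketch assuming the cachetools package is installed; any MutableMapping works.
# Note that open() reads the module-level _OPEN_CACHE, so rebind it explicitly.
#
#     from cachetools import LRUCache
#     import gramex.cache
#     gramex.cache._OPEN_CACHE = gramex.cache.set_cache(
#         LRUCache(maxsize=500), gramex.cache._OPEN_CACHE)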


_SAVE_CALLBACKS = dict(
    json='to_json',
    csv='to_csv',
    xlsx='to_excel',
    hdf='to_hdf',
    html='to_html',
    stata='to_stata',
    # Other configurations not supported
)


def save(data, url, callback=None, **kwargs):
    '''
    Saves a DataFrame into the file at url. It does not cache.

    ``callback`` is almost the same as for :py:func:`gramex.cache.open`. It can
    be ``json``, ``csv``, ``xlsx``, ``hdf``, ``html``, ``stata`` or
    a function that accepts the filename and any other arguments.

    Other keyword arguments are passed directly to the callback.
    '''
    if callback is None:
        callback = os.path.splitext(url)[-1][1:]
    if callable(callback):
        return callback(data, url, **kwargs)
    elif callback in _SAVE_CALLBACKS:
        method = getattr(data, _SAVE_CALLBACKS[callback])
        return method(url, **(used_kwargs(method, kwargs)[0]))
    else:
        raise TypeError('gramex.cache.save(callback="%s") is unknown' % callback)
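

# Example: saving a DataFrame (a sketch; the DataFrame and file names are
# illustrative). Keyword arguments the writer accepts are forwarded; the rest
# are dropped via used_kwargs().
#
#     df = pd.DataFrame({'x': [1, 2]})
#     gramex.cache.save(df, 'out.csv', index=False)
#     gramex.cache.save(df, 'out.xlsx', callback='xlsx', sheet_name='data')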


# gramex.cache.query() stores its cache here
_QUERY_CACHE = {}
_STATUS_METHODS = {}


def _wheres(dbkey, tablekey, default_db, names, fn=None):
    '''
    Convert a table name list like ``['sales', 'dept.sales']`` to a WHERE clause
    like ``(table="sales") OR (db="dept" AND table="sales")``.

    TODO: escape the table names to avoid SQL injection attacks
    '''
    where = []
    for name in names:
        # Split on the last '.' only: 'dept.sales' -> ('dept', 'sales')
        db, table = name.rsplit('.', 1) if '.' in name else (default_db, name)
        if not fn:
            where.append("({}='{}' AND {}='{}')".format(dbkey, db, tablekey, table))
        else:
            where.append("({}={}('{}') AND {}={}('{}'))".format(
                dbkey, fn[0], db, tablekey, fn[1], table))
    return ' OR '.join(where)
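

# Example: what _wheres() generates (a sketch with illustrative names):
#
#     _wheres('table_schema', 'table_name', 'main', ['sales', 'dept.sales'])
#     # => "(table_schema='main' AND table_name='sales') OR "
#     #    "(table_schema='dept' AND table_name='sales')"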


def _table_status(engine, tables):
    '''
    Returns the last updated date of a list of tables.
    '''
    # Cache the SQL query or file date check function beforehand.
    # Every time this is called with a URL and table list, run the cached query
    dialect = engine.dialect.name
    key = (engine.url, tuple(tables))
    db = engine.url.database
    if _STATUS_METHODS.get(key, None) is None:
        if len(tables) == 0:
            raise ValueError('gramex.cache.query table list is empty: %s' % repr(tables))
        for name in tables:
            if not name or not isinstance(name, six.string_types):
                raise ValueError('gramex.cache.query invalid table list: %s' % repr(tables))
        if dialect == 'mysql':
            # https://dev.mysql.com/doc/refman/5.7/en/tables-table.html
            # Works only on MySQL 5.7 and above
            q = ('SELECT update_time FROM information_schema.tables WHERE ' +
                 _wheres('table_schema', 'table_name', db, tables))
        elif dialect == 'mssql':
            # https://goo.gl/b4aL9m
            q = ('SELECT last_user_update FROM sys.dm_db_index_usage_stats WHERE ' +
                 _wheres('database_id', 'object_id', db, tables, fn=['DB_ID', 'OBJECT_ID']))
        elif dialect == 'postgresql':
            # https://www.postgresql.org/docs/9.6/static/monitoring-stats.html
            q = ('SELECT n_tup_ins, n_tup_upd, n_tup_del FROM pg_stat_all_tables WHERE ' +
                 _wheres('schemaname', 'relname', 'public', tables))
        elif dialect == 'sqlite':
            if not db:
                raise KeyError('gramex.cache.query does not support in-memory sqlite "%s"' %
                               dialect)
            q = db
        else:
            raise KeyError('gramex.cache.query cannot cache dialect "%s" yet' % dialect)
        if dialect == 'sqlite':
            _STATUS_METHODS[key] = lambda: stat(q)
        else:
            _STATUS_METHODS[key] = lambda: pd.read_sql(q, engine).to_json(orient='records')
    return _STATUS_METHODS[key]()


def query(sql, engine, state=None, **kwargs):
    '''
    Read a SQL query or database table into a DataFrame. Caches the result
    until the state changes. Without a state, the query is always re-run and
    never cached.

    The state can be specified in 4 ways:

    1. A string. This must be a lightweight SQL query. If its result changes,
       the original SQL query is re-run.
    2. A function. This is called to determine the state of the database.
    3. A list of tables. This list of ["db.table"] names specifies which tables
       to watch for. This is currently experimental.
    4. ``None``: the default. The query is always re-run and not cached.
    '''
    # Pass _reload_status = True for testing purposes. This returns a tuple:
    # (result, reloaded) instead of just the result.
    _reload_status = kwargs.pop('_reload_status', False)
    reloaded = False
    _cache = kwargs.pop('_cache', _QUERY_CACHE)
    store_cache = True

    key = (str(sql), json.dumps(kwargs.get('params', {}), sort_keys=True), engine.url)
    current_status = _cache.get(key, {}).get('status', None)
    if isinstance(state, (list, tuple)):
        status = _table_status(engine, tuple(state))
    elif isinstance(state, six.string_types):
        status = pd.read_sql(state, engine).to_dict(orient='list')
    elif callable(state):
        status = state()
    elif state is None:
        # Create a new status every time, so that the query is always re-run
        status = object()
        store_cache = False
    else:
        raise TypeError('gramex.cache.query(state=) must be a table list, query or fn, not %s' %
                        repr(state))

    if status == current_status:
        result = _cache[key]['data']
    else:
        app_log.debug('gramex.cache.query: %s. engine: %s. state: %s. kwargs: %s', sql, engine,
                      state, kwargs)
        result = pd.read_sql(sql, engine, **kwargs)
        if store_cache:
            _cache[key] = {
                'data': result,
                'status': status,
            }
        reloaded = True

    return (result, reloaded) if _reload_status else result
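

# Example: caching a query against a table-watch state. A hedged sketch assuming
# a SQLAlchemy engine and a 'sales' table in data.db; both are illustrative.
#
#     import sqlalchemy
#     engine = sqlalchemy.create_engine('sqlite:///data.db')
#     df = gramex.cache.query('SELECT * FROM sales', engine, state=['sales'])
#     # With the sqlite dialect, the SELECT re-runs only when data.db's stat() changes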


# gramex.cache.reload_module() stores its cache here. {module_name: file_stat}
_MODULE_CACHE = {}


def reload_module(*modules):
    '''
    Reloads one or more modules if they are outdated, i.e. only if the
    underlying source file has changed.

    For example::

        import mymodule             # Load cached module
        reload_module(mymodule)     # Reload module if the source has changed

    This is most useful during template development. If your changes are in a
    Python module, add these lines to pick up module changes when the
    template is re-run.
    '''
    for module in modules:
        name = getattr(module, '__name__', None)
        path = getattr(module, '__file__', None)
        # sys.__file__ does not exist, but don't raise a warning. You can't reload it
        if name in {'sys'}:
            continue
        if name is None or path is None or not os.path.exists(path):
            app_log.warning('Path for module %s is %s: not found', name, path)
            continue
        # On Python 3, __file__ points to the .py file. On Python 2, it's the .pyc file
        # https://www.python.org/dev/peps/pep-3147/#file
        if path.lower().endswith('.pyc'):
            path = path[:-1]
            if not os.path.exists(path):
                app_log.warning('Path for module %s is %s: not found', name, path)
                continue
        # The first time, don't reload it. Thereafter, if it's older or resized, reload it
        fstat = stat(path)
        if fstat != _MODULE_CACHE.get(name, fstat):
            app_log.info('Reloading module %s', name)
            six.moves.reload_module(module)
        _MODULE_CACHE[name] = fstat


def urlfetch(path, info=False, **kwargs):
    '''
    - If path is a file path, return it as-is.
    - If path is a file path and info is true, return a dict with name (filepath),
      ext (extension) and content_type, with r and url set to None.
    - If path is a URL, download the file and return the saved filename.
      The filename extension is based on the URL's Content-Type HTTP header.
    - If path is a URL and info is true, return a dict with name (filename),
      r (response), url, ext (extension) and content_type.
    - Any other keyword arguments are passed to requests.get.
    - Downloaded files are automatically deleted when the application exits.
    - This is a synchronous function, i.e. it waits until the file is downloaded.
    '''
    url = urlparse(path)
    if url.scheme not in {'http', 'https'}:     # path is a filepath
        if info:
            ext = os.path.splitext(path)[1]
            content_type = mimetypes.guess_type(path, strict=True)[0]
            return {'name': path, 'r': None, 'url': None, 'ext': ext, 'content_type': content_type}
        else:
            return path
    r = requests.get(path, **kwargs)
    if 'Content-Type' in r.headers:
        content_type = r.headers['Content-Type'].split(';')[0]
        ext = mimetypes.guess_extension(content_type, strict=False)
    else:
        ext = os.path.splitext(url.path)[1]
        content_type = None
    with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as handle:
        for chunk in r.iter_content(chunk_size=16384):
            handle.write(chunk)
    _TEMP_FILES.add(handle.name)
    if info:
        return {'name': handle.name, 'r': r, 'url': url, 'ext': ext, 'content_type': content_type}
    else:
        return handle.name
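

# Example: fetching a URL into a temp file (a sketch; the URL is illustrative).
# The temp file is deleted on program exit via _delete_temp_files().
#
#     path = gramex.cache.urlfetch('https://example.org/data.csv')
#     meta = gramex.cache.urlfetch('https://example.org/data.csv', info=True)
#     # meta['name'] is the temp file; meta['content_type'] is e.g. 'text/csv'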


class Subprocess(object):
    '''
    tornado.process.Subprocess does not work on Windows.
    https://github.com/tornadoweb/tornado/issues/1585

    This is a threaded alternative based on
    http://stackoverflow.com/a/4896288/100904

    Run a program async and wait for it to execute. Then get its output::

        stdout, stderr = yield Subprocess(['ls', '-la']).wait_for_exit()

    Run a program async and send each line to the handler as it writes::

        yield Subprocess(
            ['ls', '-la'],                  # Run 'ls -la'
            buffer_size='line',             # Buffer output line by line
            stream_stdout=handler.write,    # Send output to handler.write(line)
            stream_stderr=handler.write,    # Send errors to handler.write(line)
        )

    Run a program async and append its output to a list::

        proc = Subprocess(
            ['ls', '-la'],
            buffer_size='line',
            stream_stdout='list_out',       # Append output to self.list_out
            stream_stderr='list_err',       # Append errors to self.list_err
        )
        output = proc.list_out[-10:]        # Return last 10 lines of output
        yield proc.wait_for_exit()          # Wait until application is done

    Run a program async and append its output to a queue::

        proc = Subprocess(
            ['ls', '-la'],                  # Run 'ls -la'
            buffer_size='line',             # Buffer output line by line
            stream_stdout='queue_out',      # Save output in proc.queue_out
            stream_stderr='queue_err',      # Save errors in proc.queue_err
        )
        output = proc.queue_out.get_nowait()    # Returns first line of output
        yield proc.wait_for_exit()              # Wait until application is done

    To write to multiple streams, pass a list::

        proc = Subprocess(
            args,
            buffer_size='line',
            stream_stdout=[handler.write, 'list_out', 'queue_out', my_callback],
            stream_stderr=[handler.write, 'list_err', 'queue_err', my_callback],
            **kwargs
        )
        yield proc.wait_for_exit()

    To check the process return code, use ``.proc``, which has the ``Popen``
    object::

        if proc.proc.returncode:
            raise Exception('Process failed with return code %d' % proc.proc.returncode)

    :arg list args: command line arguments passed as a list to Subprocess
    :arg methodlist stream_stdout: optional list of write methods - called when stdout has data
    :arg methodlist stream_stderr: optional list of write methods - called when stderr has data
    :arg str_or_int buffer_size: 'line' to write line by line, any int for chunk size
    :arg dict kwargs: additional kwargs passed to subprocess.Popen

    stream_stdout and stream_stderr can be:

    - a function that accepts a byte string. Called as stdout/stderr are buffered
    - OR a string starting with ``list_`` or ``queue_``. Appends buffered output
    - OR a list of any of the above
    - OR an empty list. In this case, ``.wait_for_exit()`` returns
      ``stdout`` and ``stderr`` as byte strings.
    '''

    def __init__(self, args, stream_stdout=[], stream_stderr=[], buffer_size=0, **kwargs):
        self.args = args

        # self.proc.stdout & self.proc.stderr are streams with process output
        kwargs['stdout'] = kwargs['stderr'] = subprocess.PIPE

        # On UNIX, close all file descriptors except 0, 1, 2 before the child
        # process is executed. I've no idea why. Copied from
        # http://stackoverflow.com/a/4896288/100904
        kwargs['close_fds'] = 'posix' in sys.builtin_module_names

        self.proc = subprocess.Popen(args, **kwargs)    # nosec
        self.thread = {}        # Holds the running threads
        self.future = {}        # Stores the futures indicating stream close
        self.loop = _get_current_ioloop()

        # Buffering has 2 modes. buffer_size='line' reads and writes line by line.
        # buffer_size=<number> reads in byte chunks. Define the appropriate method
        if hasattr(buffer_size, 'lower') and 'line' in buffer_size.lower():
            def _write(stream, callbacks, future, retval):
                '''Call callbacks with content from stream. On EOF mark future as done'''
                while True:
                    content = stream.readline()
                    if len(content) > 0:
                        if isinstance(content, six.text_type):
                            content = content.encode('utf-8')
                        for callback in callbacks:
                            callback(content)
                    else:
                        stream.close()
                        break
                while self.proc.poll() is None:
                    time.sleep(MILLISECOND)
                self.loop.add_callback(future.set_result, retval())
        else:
            # If the buffer size is 0 or negative, use the default buffer size to read
            if buffer_size <= 0:
                buffer_size = io.DEFAULT_BUFFER_SIZE

            def _write(stream, callbacks, future, retval):
                '''Call callbacks with content from stream. On EOF mark future as done'''
                while True:
                    content = stream.read(buffer_size)
                    size = len(content)
                    if size > 0:
                        if isinstance(content, six.text_type):
                            content = content.encode('utf-8')
                        for callback in callbacks:
                            # This may raise a ValueError: write to closed file.
                            # TODO: decide how to handle it.
                            callback(content)
                    if size < buffer_size:
                        stream.close()
                        break
                while self.proc.poll() is None:
                    time.sleep(MILLISECOND)
                self.loop.add_callback(future.set_result, retval())

        callbacks_lookup = {'stdout': stream_stdout, 'stderr': stream_stderr}
        for stream in ('stdout', 'stderr'):
            callbacks = callbacks_lookup[stream]
            # If stream_stdout or stream_stderr are not defined, construct a
            # BytesIO and return its value when the stream is closed
            if not callbacks:
                ret_stream = io.BytesIO()
                callbacks = [ret_stream.write]
                retval = ret_stream.getvalue
            else:
                retval = lambda: b''    # noqa
            # If stream_stdout or stream_stderr has 'list_*' or 'queue_*' strings,
            # create those as list / queue attributes (e.g. self.list_out, self.queue_out)
            callbacks = list(callbacks) if isinstance(callbacks, list) else [callbacks]
            for index, method in enumerate(callbacks):
                if isinstance(method, six.string_types):
                    if method.startswith('list_'):
                        if hasattr(self, method):
                            callbacks[index] = getattr(self, method).append
                        else:
                            log = []
                            setattr(self, method, log)
                            callbacks[index] = log.append
                    elif method.startswith('queue_'):
                        if hasattr(self, method):
                            callbacks[index] = getattr(self, method).put
                        else:
                            log = Queue()
                            setattr(self, method, log)
                            callbacks[index] = log.put
                    else:
                        raise ValueError('Invalid stream_%s: %s' % (stream, method))
            self.future[stream] = future = Future()
            # Thread writes from self.proc.stdout / stderr to the appropriate callbacks
            self.thread[stream] = t = Thread(
                target=_write,
                args=(getattr(self.proc, stream), callbacks, future, retval))
            t.daemon = True     # Thread dies with the program
            t.start()

    def wait_for_exit(self):
        '''
        Returns futures for (stdout, stderr). To wait for the process to complete, use::

            stdout, stderr = yield proc.wait_for_exit()
        '''
        return [self.future['stdout'], self.future['stderr']]


_daemons = {}
_regex_type = type(re.compile(''))
# Python 3 needs sys.stderr.buffer.write for writing binary strings
_stderr_write = sys.stderr.buffer.write if hasattr(sys.stderr, 'buffer') else sys.stderr.write


def daemon(args, restart=1, first_line=None, stream=True, timeout=5, buffer_size='line', **kwargs):
    '''
    This is the same as :py:class:`Subprocess`, but adds a few checks.

    1. If we have already called :py:class:`Subprocess` with the same arguments,
       re-use the same instance.
    2. Send the process STDOUT and STDERR to this application's STDERR. This
       makes it easy to see what errors the application reports.
    3. Supports retry attempts.
    4. Checks if the first line of output matches a string / regular expression --
       ensuring that the application started properly.
    '''
    arg_str = args if isinstance(args, six.string_types) else ' '.join(args)
    try:
        key = cache_key(arg_str, kwargs)
    except (TypeError, ValueError):
        app_log.error('daemon args must be JSON serializable')
        raise
    # Send the stdout and stderr to (a) stderr AND to (b) a local queue we read
    queue = Queue(maxsize=10)
    for channel in ('stream_stdout', 'stream_stderr'):
        if channel not in kwargs:
            kwargs[channel] = []
        elif not isinstance(kwargs[channel], list):
            kwargs[channel] = [kwargs[channel]]
        if first_line:
            kwargs[channel].append(queue.put)
        if stream is True:
            kwargs[channel].append(_stderr_write)
        elif callable(stream):
            kwargs[channel].append(stream)
    # Buffer by line by default. This is required for the first_line check, not otherwise.
    kwargs['buffer_size'] = buffer_size
    # started is set if we actually call Subprocess as part of this function
    started = False

    # If the process was never started, start it
    if key not in _daemons:
        started = _daemons[key] = Subprocess(args, **kwargs)

    # Ensure that the process is running. Restart if required
    proc = _daemons[key]
    restart = int(restart)
    while proc.proc.returncode is not None and restart > 0:
        restart -= 1
        proc = started = _daemons[key] = Subprocess(args, **kwargs)
    if proc.proc.returncode is not None:
        raise RuntimeError('Error %d starting %s' % (proc.proc.returncode, arg_str))
    if started:
        app_log.info('Started: %s', arg_str)

    future = Future()
    # If the process was started, wait until it has initialized. Else just return the proc
    if first_line and started:
        if isinstance(first_line, six.string_types):
            def check(proc):
                actual = queue.get(timeout=timeout).decode('utf-8')
                if first_line not in actual:
                    raise AssertionError('%s: wrong first line: %s (no "%s")' %
                                         (arg_str, actual, first_line))
        elif isinstance(first_line, _regex_type):
            def check(proc):
                actual = queue.get(timeout=timeout).decode('utf-8')
                if not first_line.search(actual):
                    raise AssertionError('%s: wrong first line: %s' % (arg_str, actual))
        elif callable(first_line):
            check = first_line
        loop = _get_current_ioloop()

        def checker(proc):
            try:
                check(proc)
            except Exception as e:
                loop.add_callback(future.set_exception, e)
            else:
                loop.add_callback(future.set_result, proc)

        proc._check_thread = t = Thread(target=checker, args=(proc,))
        t.daemon = True     # Thread dies with the program
        t.start()
    else:
        future.set_result(proc)
    return future
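

# Example: keeping a single dev server running via daemon(). A hedged sketch;
# the command and first_line pattern are illustrative. Repeated calls with the
# same args re-use the running process instead of spawning a new one.
#
#     proc_future = gramex.cache.daemon(
#         ['python', '-m', 'http.server', '8000'],
#         first_line=re.compile(r'Serving HTTP'),
#         restart=2)
#     # Inside a Tornado coroutine: proc = yield proc_future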


def _get_current_ioloop():
    '''
    Return the current IOLoop. But if we're not already in an IOLoop, return an
    object that mimics add_callback() by running the method immediately.
    This allows daemon() to be run without Tornado / asyncio.
    '''
    loop = IOLoop.current(instance=False)
    if loop is None:
        loop = AttrDict(add_callback=lambda fn, *args, **kwargs: fn(*args, **kwargs))
    return loop


def get_store(type, **kwargs):
    if type == 'memory':
        return KeyStore(**kwargs)
    elif type == 'sqlite':
        return SQLiteStore(**kwargs)
    elif type == 'json':
        return JSONStore(**kwargs)
    elif type == 'redis':
        return RedisStore(**kwargs)
    elif type == 'hdf5':
        return HDF5Store(**kwargs)
    else:
        raise NotImplementedError('Store type: %s not implemented' % type)
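

# Example: constructing a store via get_store() (a sketch; 'data.json' is an
# illustrative file name). Keyword arguments go to the store's constructor.
#
#     store = get_store('json', path='data.json', flush=15)
#     store.dump('user', {'name': 'alice'})
#     store.flush()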


class KeyStore(object):
    '''
    Base class for persistent dictionaries. (But KeyStore itself is not persistent.)

    >>> store = KeyStore()
    >>> value = store.load(key, None)   # Load a value. It's like dict.get()
    >>> store.dump(key, value)          # Save a value. It's like dict.set(), but doesn't flush
    >>> store.flush()                   # Saves to disk
    >>> store.close()                   # Close the store

    You can initialize a KeyStore with a ``flush=`` parameter. The store is
    flushed to disk via ``store.flush()`` every ``flush`` seconds.

    If a ``purge=`` is provided, the data is purged of missing values every
    ``purge`` seconds. You can provide a custom ``purge_keys=`` function that
    returns an iterator of keys to delete, if any.

    When the program exits, ``.close()`` is automatically called.
    '''

    def __init__(self, flush=None, purge=None, purge_keys=None, **kwargs):
        '''Initialise the KeyStore at path'''
        self.store = {}
        if callable(purge_keys):
            self.purge_keys = purge_keys
        elif purge_keys is not None:
            app_log.error(
                'KeyStore: purge_keys=%r invalid. Must be function(dict)',
                purge_keys)
        # Periodically flush and purge buffers
        if flush is not None:
            PeriodicCallback(self.flush, callback_time=flush * 1000).start()
        if purge is not None:
            PeriodicCallback(self.purge, callback_time=purge * 1000).start()
        # Call close() when Python gracefully exits
        atexit.register(self.close)

    def keys(self):
        '''Return all keys in the store'''
        return self.store.keys()

    def load(self, key, default=None):
        '''Same as store.get(), but called "load" to indicate persistence'''
        key = self._escape(key)
        return self.store.get(key, {} if default is None else default)

    def dump(self, key, value):
        '''Same as store[key] = value'''
        key = self._escape(key)
        self.store[key] = value

    def _escape(self, key):
        '''Converts key into a unicode string (interpreting byte-string keys as UTF-8)'''
        if isinstance(key, six.binary_type):
            return six.text_type(key, encoding='utf-8')
        return six.text_type(key)

    @staticmethod
    def purge_keys(data):
        return [key for key, val in data.items() if val is None]

    def flush(self):
        '''Write to disk'''
        pass

    def purge(self):
        '''Delete empty keys and flush'''
        for key in self.purge_keys(self.store):
            try:
                del self.store[key]
            except KeyError:
                # If the key was already removed from the store, ignore
                pass
        self.flush()

    def close(self):
        '''Flush and close all open handles'''
        raise NotImplementedError()
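

# Example: a custom purge_keys= function that expires entries. A hedged sketch;
# it assumes values are dicts carrying an '_expiry' timestamp, which is purely
# illustrative. purge_keys() receives the store's data dict and returns keys
# to delete.
#
#     def expired(data):
#         now = time.time()
#         return [k for k, v in data.items()
#                 if isinstance(v, dict) and v.get('_expiry', now) < now]
#
#     store = get_store('json', path='sessions.json', purge=60, purge_keys=expired)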


class RedisStore(KeyStore):
    '''
    A KeyStore that stores data in a Redis database. Typical usage::

        >>> store = RedisStore('localhost:6379:1:password=x:...')  # host:port:db:params
        >>> value = store.load(key)
        >>> store.dump(key, value)

    The path in the constructor contains parameters separated by colon (:):

    - `host`: the Redis server location (default: localhost)
    - `port`: the Redis server port (default: 6379)
    - `db`: the Redis server DB number (default: 0)
    - zero or more parameters passed to StrictRedis (e.g. password=abc)

    Values are encoded as JSON using gramex.config.CustomJSONEncoder (thus
    handling datetime.) Keys are JSON encoded.
    '''

    def __init__(self, path=None, *args, **kwargs):
        super(RedisStore, self).__init__(*args, **kwargs)
        from redis import StrictRedis
        host, port, db, redis_kwargs = 'localhost', 6379, 0, {}
        if isinstance(path, six.string_types):
            parts = path.split(':')
            if len(parts):
                host = parts.pop(0)
            if len(parts):
                port = int(parts.pop(0))
            if len(parts):
                db = int(parts.pop(0))
            # Split each remaining part on the first '=' into key=value pairs
            redis_kwargs = dict(part.split('=', 1) for part in parts)
        redis_kwargs['decode_responses'] = True
        redis_kwargs.setdefault('encoding', 'utf-8')
        self.store = StrictRedis(host=host, port=port, db=db, **redis_kwargs)

    def load(self, key, default=None):
        result = self.store.get(key)
        if result is None:
            return default
        try:
            return json.loads(
                result, object_pairs_hook=AttrDict, cls=CustomJSONDecoder)
        except ValueError:
            app_log.error('RedisStore("%s").load("%s") is not JSON ("%r...")',
                          self.store, key, result)
            return default

    def dump(self, key, value):
        if value is None:
            self.store.delete(key)
        else:
            value = json.dumps(
                value,
                ensure_ascii=True,
                separators=(',', ':'),
                cls=CustomJSONEncoder)
            self.store.set(key, value)

    def close(self):
        pass

    def purge(self):
        app_log.debug('Purging %s', self.store)
        # TODO: optimize item retrieval
        items = {key: self.load(key, None) for key in self.store.keys()}
        for key in self.purge_keys(items):
            self.store.delete(key)


class SQLiteStore(KeyStore):
    '''
    A KeyStore that stores data in a SQLite file. Typical usage::

        >>> store = SQLiteStore('file.db', table='store')
        >>> value = store.load(key)
        >>> store.dump(key, value)

    Values are encoded as JSON using gramex.config.CustomJSONEncoder (thus
    handling datetime.) Keys are JSON encoded.
    '''

    def __init__(self, path, table='store', *args, **kwargs):
        super(SQLiteStore, self).__init__(*args, **kwargs)
        self.path = _create_path(path)
        from sqlitedict import SqliteDict
        self.store = SqliteDict(
            self.path, tablename=table, autocommit=True,
            encode=lambda v: json.dumps(v, separators=(',', ':'), ensure_ascii=True,
                                        cls=CustomJSONEncoder),
            decode=lambda v: json.loads(v, object_pairs_hook=AttrDict, cls=CustomJSONDecoder),
        )

    def close(self):
        self.store.close()

    def flush(self):
        super(SQLiteStore, self).flush()
        self.store.commit()

    def keys(self):
        # Keys need to be escaped
        return (self._escape(key) for key in self.store.keys())

    def purge(self):
        app_log.debug('Purging %s', self.path)
        super(SQLiteStore, self).purge()


class HDF5Store(KeyStore):
    '''
    A KeyStore that stores data in an HDF5 file. Typical usage::

        >>> store = HDF5Store('file.h5', flush=15)
        >>> value = store.load(key)
        >>> store.dump(key, value)

    Internally, it uses HDF5 groups to store data. Values are encoded as JSON
    using gramex.config.CustomJSONEncoder (thus handling datetime.) Keys are JSON
    encoded, and '/' is escaped as well (since HDF5 groups treat / as subgroups.)
    '''

    def __init__(self, path, *args, **kwargs):
        super(HDF5Store, self).__init__(*args, **kwargs)
        self.path = _create_path(path)
        self.changed = False
        import h5py
        # h5py.File fails with OSError: Unable to create file (unable to open file: name =
        # '.meta.h5', errno = 17, error message = 'File exists', flags = 15, o_flags = 502)
        # TODO: identify why this happens and resolve it.
        self.store = h5py.File(self.path, 'a')

    def load(self, key, default=None):
        # Keys cannot contain / in the HDF5 store. Escape it
        key = self._escape(key).replace('/', '\t')
        result = self.store.get(key, None)
        if result is None:
            return default
        try:
            return json.loads(
                result.value,
                object_pairs_hook=AttrDict,
                cls=CustomJSONDecoder)
        except ValueError:
            app_log.error('HDF5Store("%s").load("%s") is not JSON ("%r...")',
                          self.path, key, result.value)
            return default

    def dump(self, key, value):
        key = self._escape(key)
        if self.store.get(key) != value:
            if key in self.store:
                del self.store[key]
            self.store[key] = json.dumps(
                value,
                ensure_ascii=True,
                separators=(',', ':'),
                cls=CustomJSONEncoder)
            self.changed = True

    def _escape(self, key):
        '''
        Converts key into a unicode string (interpreting byte-string keys as UTF-8).
        HDF5 does not accept / in key names. Replace those with tabs.
        '''
        if isinstance(key, six.binary_type):
            key = six.text_type(key, encoding='utf-8')
        else:
            key = six.text_type(key)
        return key.replace('/', '\t')

    def keys(self):
        # Keys cannot contain / in the HDF5 store. Unescape it
        return (key.replace('\t', '/') for key in self.store.keys())

    def flush(self):
        super(HDF5Store, self).flush()
        if self.changed:
            app_log.debug('Flushing %s', self.path)
            self.store.flush()
            self.changed = False

    def purge(self):
        '''
        Load all keys into self.store. Delete what's required. Save.
        '''
        self.flush()
        changed = False
        items = {
            key: json.loads(
                val.value, object_pairs_hook=AttrDict, cls=CustomJSONDecoder)
            for key, val in self.store.items()
        }
        for key in self.purge_keys(items):
            del self.store[key]
            changed = True
        if changed:
            app_log.debug('Purging %s', self.path)
            self.store.flush()

    def close(self):
        try:
            self.store.close()
        # h5py.h5f.get_obj_ids often raises a ValueError: Not a file id.
        # This presumably happens if the file handle has already been closed. Log & ignore.
        except ValueError:
            app_log.debug('HDF5Store("%s").close() error ignored', self.path)


class JSONStore(KeyStore):
    '''
    A KeyStore that stores data in a JSON file. Typical usage::

        >>> store = JSONStore('file.json', flush=15)
        >>> value = store.load(key)
        >>> store.dump(key, value)

    This is less efficient than HDF5Store for large data, but is human-readable.
    It does not support multiple instances: only one JSONStore instance is
    permitted per file.
    '''

    def __init__(self, path, *args, **kwargs):
        super(JSONStore, self).__init__(*args, **kwargs)
        self.path = _create_path(path)
        self.store = self._read_json()
        self.changed = False
        self.update = {}        # key-values added since flush

    def _read_json(self):
        try:
            with io.open(self.path) as handle:      # noqa: no encoding for json
                return json.load(handle, cls=CustomJSONDecoder)
        except (IOError, ValueError):
            return {}

    def _write_json(self, data):
        json_value = json.dumps(
            data,
            ensure_ascii=True,
            separators=(',', ':'),
            cls=CustomJSONEncoder)
        with io.open(self.path, 'w') as handle:     # noqa: no encoding for json
            handle.write(json_value)

    def dump(self, key, value):
        '''Same as store[key] = value'''
        key = self._escape(key)
        if self.store.get(key) != value:
            self.store[key] = value
            self.update[key] = value
            self.changed = True

    def flush(self):
        super(JSONStore, self).flush()
        if self.changed:
            app_log.debug('Flushing %s', self.path)
            store = self._read_json()
            store.update(self.update)
            self._write_json(store)
            self.store = store
            self.update = {}
            self.changed = False

    def purge(self):
        '''
        Load all keys into self.store. Delete what's required. Save.
        '''
        self.flush()
        changed = False
        for key in self.purge_keys(self.store):
            del self.store[key]
            changed = True
        if changed:
            app_log.debug('Purging %s', self.path)
            self._write_json(self.store)

    def close(self):
        try:
            self.flush()
        # This has happened when the directory was deleted. Log & ignore.
        except OSError:
            app_log.error('Cannot flush %s', self.path)


def _create_path(path):
    # Ensure that the path's directory exists
    path = os.path.abspath(path)
    folder = os.path.dirname(path)
    if not os.path.exists(folder):
        os.makedirs(folder)
    return path


def sizeof(obj):
    '''Recursively estimate the memory footprint of obj (in bytes) via sys.getsizeof'''
    if isinstance(obj, dict):
        return sys.getsizeof(obj) + sum(sizeof(k) + sizeof(v) for k, v in obj.items())
    elif isinstance(obj, (set, list)):
        return sys.getsizeof(obj) + sum(sizeof(v) for v in obj)
    return sys.getsizeof(obj)
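

# Example: sizeof() recurses into containers (a sketch; exact byte counts vary
# by Python version and platform):
#
#     sizeof([1, 2, 3])           # list overhead + size of each int
#     sizeof({'a': [1, 2]})       # dict overhead + key size + nested list size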