# Coverage for gramex/data.py : 92%
#
# Hot-keys on this page
# r m x p   toggle line displays
# j k       next/prev highlighted chunk
# 0 (zero)  top of page
# 1 (one)   first highlighted chunk
1'''
2Interact with data from the browser
3'''
import datetime
import io
import json
import os
import re
import time

import pandas as pd
import six
import sqlalchemy
from orderedattrdict import AttrDict
from sqlalchemy.sql import text
from tornado.escape import json_encode

import gramex.cache
from gramex.config import merge, app_log
# Module-level caches shared across calls: see create_engine() and get_table()
_ENGINE_CACHE = {}
_METADATA_CACHE = {}
# Directory this module lives in
_FOLDER = os.path.dirname(os.path.abspath(__file__))
# Dummy path used by _path_safe to detect sub-directories
_path_safe_root = os.path.realpath('/root/dir')
# Aggregator separator. ?col|SUM treats SUM as an aggregate function
_agg_sep = '|'
# List of aggregated types returned by operators (if different from column type)
# Note: For aggregation functions, see:
# SQLite: https://www.sqlite.org/lang_aggfunc.html
# MySQL: https://dev.mysql.com/doc/refman/8.0/en/group-by-functions.html
# PostgreSQL: https://www.postgresql.org/docs/9.5/static/functions-aggregate.html
# SQL Server: http://bit.ly/2MYPgQi
# DB2: https://ibm.co/2Kfbnjw
# Oracle: https://docs.oracle.com/database/121/SQLRF/functions003.htm
_agg_type = {
    'sum': float,
    'count': int,
    'avg': float,
    'stdev': float,     # MS SQL version of stddev
    'stddev': float,
    'rank': int,
    'percent_rank': float,
    # The following types are the same as the columns:
    # first, last, min, max, median
}
# Names of Python types (as returned by SQLAlchemy's python_type) that are numeric
_numeric_types = {'int', 'long', 'float', 'Decimal'}
48def _transform_fn(transform, transform_kwargs):
49 if transform is not None and transform_kwargs is not None:
50 return lambda v: transform(v, **transform_kwargs)
51 return transform
def _replace(engine, args, *vars, **kwargs):
    '''
    Recursively ``.format()`` every string inside ``vars`` and ``kwargs``
    using the first value of each arg -- but only args whose first value
    passes the engine-appropriate safety check (``_sql_safe`` for SQLAlchemy
    urls, ``_path_safe`` otherwise). Lists and dicts are formatted
    element-by-element; other values pass through untouched.

    Returns ``[formatted vars...] + [formatted kwargs]``.
    '''
    is_safe = _sql_safe if engine == 'sqlalchemy' else _path_safe
    params = {}
    for key, val in args.items():
        if len(val) > 0 and is_safe(val[0]):
            params[key] = val[0]

    def _format(val):
        if isinstance(val, six.string_types):
            return val.format(**params)
        if isinstance(val, list):
            return [_format(item) for item in val]
        if isinstance(val, dict):
            return AttrDict([(key, _format(item)) for key, item in val.items()])
        return val

    return _format(list(vars)) + [_format(kwargs)]
def filter(url, args=None, meta=None, engine=None, table=None, ext=None,
           query=None, queryfile=None, transform=None, transform_kwargs=None, **kwargs):
    '''
    Filters data using URL query parameters. Typical usage::

        filtered = gramex.data.filter(dataframe, args=handler.args)
        filtered = gramex.data.filter('file.csv', args=handler.args)
        filtered = gramex.data.filter('mysql://server/db', table='table', args=handler.args)

    It accepts the following parameters:

    :arg source url: Pandas DataFrame, sqlalchemy URL, directory or file name,
        http(s) data file, all ``.format``-ed using ``args``.
    :arg dict args: URL query parameters as a dict of lists. Pass handler.args or parse_qs results
    :arg dict meta: this dict is updated with metadata during the course of filtering
    :arg str engine: over-rides the auto-detected engine. Can be 'dataframe', 'file',
        'http', 'https', 'sqlalchemy', 'dir'
    :arg str table: table name (if url is an SQLAlchemy URL), ``.format``-ed
        using ``args``.
    :arg str ext: file extension (if url is a file). Defaults to url extension
    :arg str query: optional SQL query to execute (if url is a database),
        ``.format``-ed using ``args`` and supports SQLAlchemy SQL parameters.
        Loads entire result in memory before filtering.
    :arg str queryfile: optional SQL query file to execute (if url is a database).
        Same as specifying the ``query:`` in a file. Overrides ``query:``
    :arg function transform: optional in-memory transform of source data. Takes
        the result of gramex.cache.open or gramex.cache.query. Must return a
        DataFrame. Applied to both file and SQLAlchemy urls.
    :arg dict transform_kwargs: optional keyword arguments to be passed to the
        transform function -- apart from data
    :arg dict kwargs: Additional parameters are passed to
        :py:func:`gramex.cache.open` or ``sqlalchemy.create_engine``
    :return: a filtered DataFrame

    Remaining kwargs are passed to :py:func:`gramex.cache.open` if ``url`` is a file, or
    ``sqlalchemy.create_engine`` if ``url`` is a SQLAlchemy URL.

    If this is used in a handler as::

        filtered = gramex.data.filter(dataframe, args=handler.args)

    ... then calling the handler with ``?x=1&y=2`` returns all rows in
    ``dataframe`` where x is 1 and y is 2.

    If a table or query is passed to an SQLAlchemy url, it is formatted using
    ``args``. For example::

        data = gramex.data.filter('mysql://server/db', table='{xxx}', args=handler.args)

    ... when passed ``?xxx=sales`` returns rows from the sales table. Similarly::

        data = gramex.data.filter('mysql://server/db', args=handler.args,
                                  query='SELECT {col}, COUNT(*) FROM table GROUP BY {col}')

    ... when passed ``?col=City`` replaces ``{col}`` with ``City``.

    **NOTE**: To avoid SQL injection attacks, only values without spaces are
    allowed. So ``?col=City Name`` or ``?col=City+Name`` **will not** work.

    The URL supports operators filter like this:

    - ``?x`` selects x is not null
    - ``?x!`` selects x is null
    - ``?x=val`` selects x == val
    - ``?x!=val`` selects x != val
    - ``?x>=val`` selects x > val
    - ``?x>~=val`` selects x >= val
    - ``?x<=val`` selects x < val
    - ``?x<~=val`` selects x <= val
    - ``?x~=val`` selects x matches val as a regular expression
    - ``?x!~=val`` selects x does not match val as a regular expression

    Multiple filters are combined into an AND clause. Ranges can also be
    specified like this:

    - ``?x=a&y=b`` selects x = a AND y = b
    - ``?x>=100&x<=200`` selects x > 100 AND x < 200

    If the same column has multiple values, they are combined like this:

    - ``?x=a&x=b`` selects x IN (a, b)
    - ``?x!=a&x!=b`` selects x NOT IN (a, b)
    - ``?x~=a&x~=b`` selects x ~ a|b
    - ``?x>=a&x>=b`` selects x > MIN(a, b)
    - ``?x<=a&x<=b`` selects x < MAX(a, b)

    Arguments are converted to the type of the column before comparing. If this
    fails, it raises a ValueError.

    These URL query parameters control the output:

    - ``?_sort=col`` sorts column col in ascending order. ``?_sort=-col`` sorts
      in descending order.
    - ``?_limit=100`` limits the result to 100 rows
    - ``?_offset=100`` starts showing the result from row 100. Default: 0
    - ``?_c=x&_c=y`` returns only columns ``[x, y]``. ``?_c=-col`` drops col.

    If a column name matches one of the above, you cannot filter by that column.
    Avoid column names beginning with _.

    To get additional information about the filtering, use::

        meta = {}      # Create a variable which will be filled with more info
        filtered = gramex.data.filter(data, meta=meta, **handler.args)

    The ``meta`` variable is populated with the following keys:

    - ``filters``: Applied filters as ``[(col, op, val), ...]``
    - ``ignored``: Ignored filters as ``[(col, vals), ('_sort', col), ('_by', col), ...]``
    - ``excluded``: Excluded columns as ``[col, ...]``
    - ``sort``: Sorted columns as ``[(col, True), ...]``. The second parameter is ``ascending=``
    - ``offset``: Offset as integer. Defaults to 0
    - ``limit``: Limit as integer - ``None`` if limit is not applied
    - ``count``: Total number of rows, if available
    - ``by``: Group by columns as ``[col, ...]``

    These variables may be useful to show additional information about the
    filtered data.
    '''
    # Avoid mutable default arguments. The old meta={} default was a single
    # shared dict, mutated by meta.update() below and leaking across calls.
    args = {} if args is None else args
    meta = {} if meta is None else meta
    # Auto-detect engine.
    if engine is None:
        engine = get_engine(url)
    # Reset the metadata fields this function populates
    meta.update({
        'filters': [],      # Applied filters as [(col, op, val), ...]
        'ignored': [],      # Ignored filters as [(col, vals), ...]
        'sort': [],         # Sorted columns as [(col, asc), ...]
        'offset': 0,        # Offset as integer
        'limit': None,      # Limit as integer - None if not applied
        'by': [],           # Group by columns as [col, ...]
    })
    controls = _pop_controls(args)
    transform = _transform_fn(transform, transform_kwargs)
    # .format() url/table/query/etc using safe single-valued args
    url, table, ext, query, queryfile, kwargs = _replace(
        engine, args, url, table, ext, query, queryfile, **kwargs)

    # Use the appropriate filter function based on the engine
    if engine == 'dataframe':
        data = transform(url) if callable(transform) else url
        return _filter_frame(data, meta=meta, controls=controls, args=args)
    elif engine == 'dir':
        data = dirstat(url, **args)
        data = transform(data) if callable(transform) else data
        return _filter_frame(data, meta=meta, controls=controls, args=args)
    elif engine in {'file', 'http', 'https'}:
        if engine == 'file' and not os.path.exists(url):
            raise OSError('url: %s not found' % url)
        # Get the full dataset. Then filter it
        data = gramex.cache.open(url, ext, transform=transform, **kwargs)
        return _filter_frame(data, meta=meta, controls=controls, args=args)
    elif engine == 'sqlalchemy':
        engine = create_engine(url, **kwargs)
        if query or queryfile:
            if queryfile:
                query = gramex.cache.open(queryfile, 'text')
            # `state` tells gramex.cache.query when to re-run the query
            state = None
            if isinstance(table, six.string_types):
                state = table if ' ' in table else [table]
            elif isinstance(table, (list, tuple)):
                state = [t for t in table]
            elif table is not None:
                raise ValueError('table: must be string or list of strings, not %r' % table)
            all_params = {k: v[0] for k, v in args.items() if len(v) > 0}
            data = gramex.cache.query(text(query), engine, state, params=all_params)
            data = transform(data) if callable(transform) else data
            return _filter_frame(data, meta=meta, controls=controls, args=args)
        elif table:
            if callable(transform):
                # With a transform, load the whole table and filter in-memory
                data = gramex.cache.query(table, engine, [table])
                return _filter_frame(transform(data), meta=meta, controls=controls, args=args)
            else:
                # Without a transform, filter in the database itself
                return _filter_db(engine, table, meta=meta, controls=controls, args=args)
        else:
            raise ValueError('No table: or query: specified')
    else:
        raise ValueError('engine: %s invalid. Can be sqlalchemy|file|dataframe' % engine)
def delete(url, meta=None, args=None, engine=None, table=None, ext=None, id=None,
           query=None, queryfile=None, transform=None, transform_kwargs=None, **kwargs):
    '''
    Deletes data using URL query parameters. Typical usage::

        count = gramex.data.delete(dataframe, args=handler.args, id=['id'])
        count = gramex.data.delete('file.csv', args=handler.args, id=['id'])
        count = gramex.data.delete('mysql://server/db', table='table', args=handler.args, id='id')

    ``id`` is a column name or a list of column names defining the primary key.
    Calling this in a handler with ``?id=1&id=2`` deletes rows with id is 1 or 2.

    It accepts the same parameters as :py:func:`filter`, and returns the number
    of deleted rows.
    '''
    # Avoid a shared mutable default: the old meta={} leaked state across calls
    meta = {} if meta is None else meta
    if engine is None:
        engine = get_engine(url)
    meta.update({'filters': [], 'ignored': []})
    controls = _pop_controls(args)
    url, table, ext, query, queryfile, kwargs = _replace(
        engine, args, url, table, ext, query, queryfile, **kwargs)
    if engine == 'dataframe':
        # _filter_frame drops the matched rows from `url` in-place
        data_filtered = _filter_frame(url, meta=meta, controls=controls,
                                      args=args, source='delete', id=id)
        return len(data_filtered)
    elif engine == 'file':
        data = gramex.cache.open(url, ext, transform=transform, **kwargs)
        data_filtered = _filter_frame(data, meta=meta, controls=controls,
                                      args=args, source='delete', id=id)
        # Write the remaining rows back to the file
        gramex.cache.save(data, url, ext, index=False, **kwargs)
        return len(data_filtered)
    elif engine == 'sqlalchemy':
        if table is None:
            raise ValueError('No table: specified')
        engine = create_engine(url, **kwargs)
        return _filter_db(engine, table, meta=meta, controls=controls, args=args,
                          source='delete', id=id)
    else:
        # NOTE: the unreachable `return 0` after this raise was removed
        raise ValueError('engine: %s invalid. Can be sqlalchemy|file|dataframe' % engine)
def update(url, meta=None, args=None, engine=None, table=None, ext=None, id=None,
           query=None, queryfile=None, transform=None, transform_kwargs=None, **kwargs):
    '''
    Update data using URL query parameters. Typical usage::

        count = gramex.data.update(dataframe, args=handler.args, id=['id'])
        count = gramex.data.update('file.csv', args=handler.args, id=['id'])
        count = gramex.data.update('mysql://server/db', table='table', args=handler.args, id='id')

    ``id`` is a column name or a list of column names defining the primary key.
    Calling this in a handler with ``?id=1&x=2`` updates x=2 where id=1.

    It accepts the same parameters as :py:func:`filter`, and returns the number of updated rows.
    '''
    # Avoid a shared mutable default: the old meta={} leaked state across calls
    meta = {} if meta is None else meta
    if engine is None:
        engine = get_engine(url)
    meta.update({'filters': [], 'ignored': []})
    controls = _pop_controls(args)
    url, table, ext, query, queryfile, kwargs = _replace(
        engine, args, url, table, ext, query, queryfile, **kwargs)
    if engine == 'dataframe':
        # _filter_frame updates the matched rows of `url` in-place
        data_updated = _filter_frame(
            url, meta=meta, controls=controls, args=args, source='update', id=id)
        return len(data_updated)
    elif engine == 'file':
        data = gramex.cache.open(url, ext, transform=transform, **kwargs)
        data_updated = _filter_frame(
            data, meta=meta, controls=controls, args=args, source='update', id=id)
        # Write the updated rows back to the file
        gramex.cache.save(data, url, ext, index=False, **kwargs)
        return len(data_updated)
    elif engine == 'sqlalchemy':
        if table is None:
            raise ValueError('No table: specified')
        engine = create_engine(url, **kwargs)
        return _filter_db(engine, table, meta=meta, controls=controls, args=args,
                          source='update', id=id)
    else:
        # NOTE: the unreachable `return 0` after this raise was removed
        raise ValueError('engine: %s invalid. Can be sqlalchemy|file|dataframe' % engine)
def insert(url, meta=None, args=None, engine=None, table=None, ext=None, id=None,
           query=None, queryfile=None, transform=None, transform_kwargs=None, **kwargs):
    '''
    Insert data using URL query parameters. Typical usage::

        count = gramex.data.insert(dataframe, args=handler.args, id=['id'])
        count = gramex.data.insert('file.csv', args=handler.args, id=['id'])
        count = gramex.data.insert('mysql://server/db', table='table', args=handler.args, id='id')

    ``id`` is a column name or a list of column names defining the primary key.
    Calling this in a handler with ``?id=3&x=2`` inserts a new record with id=3 and x=2.

    If the target file / table does not exist, it is created.

    It accepts the same parameters as :py:func:`filter`, and returns the number of updated rows.
    '''
    # Avoid a shared mutable default: the old meta={} leaked state across calls
    meta = {} if meta is None else meta
    if engine is None:
        engine = get_engine(url)
    if not args:
        # Explicit error instead of the opaque "max() arg is an empty sequence"
        raise ValueError('No args: specified')
    _pop_controls(args)
    meta.update({'filters': [], 'ignored': []})
    # If values do not have equal number of elements, pad them and warn
    rowcount = max(len(val) for val in args.values())
    for key, val in args.items():
        rows = len(val)
        if 0 < rows < rowcount:
            val += [val[-1]] * (rowcount - rows)
            app_log.warning('data.insert: column %s has %d rows not %d. Extended last value %s',
                            key, rows, rowcount, val[-1])
    rows = pd.DataFrame.from_dict(args)
    url, table, ext, query, queryfile, kwargs = _replace(
        engine, args, url, table, ext, query, queryfile, **kwargs)
    if engine == 'dataframe':
        rows = _pop_columns(rows, url.columns, meta['ignored'])
        # NOTE(review): this rebinds the local `url` only -- the caller's
        # DataFrame is NOT modified. Preserved as-is; confirm intent upstream.
        url = pd.concat([url, rows], sort=False)
        return len(rows)
    elif engine == 'file':
        try:
            data = gramex.cache.open(url, ext, transform=None, **kwargs)
        except (OSError, IOError):
            # File does not exist (or can't be read): create it from the new rows
            data = rows
        else:
            rows = _pop_columns(rows, data.columns, meta['ignored'])
            # DataFrame.append was removed in pandas 2.0; pd.concat is equivalent
            data = pd.concat([data, rows], sort=False)
        gramex.cache.save(data, url, ext, index=False, **kwargs)
        return len(rows)
    elif engine == 'sqlalchemy':
        if table is None:
            raise ValueError('No table: specified')
        engine = create_engine(url, **kwargs)
        try:
            cols = get_table(engine, table).columns
        except sqlalchemy.exc.NoSuchTableError:
            # Table does not exist yet: to_sql below will create it
            pass
        else:
            rows = _pop_columns(rows, [col.name for col in cols], meta['ignored'])
        if '.' in table:
            kwargs['schema'], table = table.rsplit('.', 1)
        # pandas does not document engine.dialect.has_table so it might change.
        if not engine.dialect.has_table(engine, table) and id:
            # Pre-create the table so the id column(s) become the primary key
            engine.execute(pd.io.sql.get_schema(rows, name=table, keys=id, con=engine))
        rows.to_sql(table, engine, if_exists='append', index=False, **kwargs)
        return len(rows)
    else:
        # NOTE: the unreachable `return 0` after this raise was removed
        raise ValueError('engine: %s invalid. Can be sqlalchemy|file|dataframe' % engine)
def get_engine(url):
    '''
    Detect the type of ``url``. Returns:

    - ``'dataframe'`` if url is a Pandas DataFrame
    - ``'sqlalchemy'`` if url is a sqlalchemy compatible URL
    - ``protocol`` if url is of the form `protocol://...`
    - ``'dir'`` if it is not a URL but a valid directory
    - ``'file'`` if it is not a URL but a valid file

    Else it raises an Exception
    '''
    if isinstance(url, pd.DataFrame):
        return 'dataframe'
    try:
        parsed = sqlalchemy.engine.url.make_url(url)
    except sqlalchemy.exc.ArgumentError:
        # Not a sqlalchemy-style URL: treat it as a filesystem path
        return 'dir' if os.path.isdir(url) else 'file'
    try:
        parsed.get_driver_name()
    except sqlalchemy.exc.NoSuchModuleError:
        # Parses as a URL but has no DB driver, e.g. http:// -> 'http'
        return parsed.drivername
    return 'sqlalchemy'
def create_engine(url, **kwargs):
    '''
    Cached version of sqlalchemy.create_engine.

    Normally, this is not required. But :py:func:`get_table` caches the engine
    *and* metadata *and* uses autoload=True. This makes sqlalchemy create a new
    database connection for every engine object, and not dispose it. So we
    re-use the engine objects within this module.

    Note: ``kwargs`` only take effect on the first call for a given url -- a
    cache hit returns the previously built engine unchanged.
    '''
    try:
        return _ENGINE_CACHE[url]
    except KeyError:
        engine = sqlalchemy.create_engine(url, **kwargs)
        _ENGINE_CACHE[url] = engine
        return engine
def get_table(engine, table):
    '''Return the sqlalchemy Table, reflecting it from the database.

    Metadata is cached per engine so repeated lookups re-use reflection.
    A dotted name like ``schema.table`` is split on the LAST dot.
    '''
    if engine not in _METADATA_CACHE:
        _METADATA_CACHE[engine] = sqlalchemy.MetaData()
    metadata = _METADATA_CACHE[engine]
    schema, _, name = table.rpartition('.')
    if schema:
        return sqlalchemy.Table(name, metadata, autoload=True, autoload_with=engine,
                                schema=schema)
    return sqlalchemy.Table(name, metadata, autoload=True, autoload_with=engine)
450def _pop_controls(args):
451 '''Filter out data controls: sort, limit, offset and column (_c) from args'''
452 return {
453 key: args.pop(key)
454 for key in ('_sort', '_limit', '_offset', '_c', '_by')
455 if key in args
456 }
459def _pop_columns(data, cols, ignored):
460 '''Remove columns not in cols'''
461 cols = set(cols)
462 for col in data.columns:
463 if col not in cols:
464 ignored.append([col, data[col].tolist()])
465 return data[[col for col in cols if col in data.columns]]
def _sql_safe(val):
    '''Return True if ``val`` may be embedded in an SQL query.

    Strings are accepted only when they contain no whitespace (a simple
    anti-injection guard); ints, floats and bools are always accepted;
    every other type is rejected.
    '''
    if isinstance(val, six.string_types):
        return re.search(r'\s', val) is None
    return isinstance(val, six.integer_types + (float, bool))
def _path_safe(path):
    '''Return True if ``path`` cannot escape the base directory (no .. / absolute tricks).'''
    # Non-strings are generally not meant for paths: let them through
    if not isinstance(path, six.string_types):
        return True
    resolved = os.path.realpath(os.path.join(_path_safe_root, path))
    return resolved.startswith(_path_safe_root)
# The order of operators is important. ~ is at the end. Otherwise, !~
# or >~ will also be mapped to ~ as an operator
# (_filter_col tests col.endswith(op) against this list in order)
operators = ['!', '>', '>~', '<', '<~', '!~', '~']
def _filter_col(col, cols):
    '''
    Parses a column name from a list of columns and returns a (col, agg, op)
    tuple.

    - ``col`` is the name of the column in cols.
    - ``agg`` is the aggregation operation (SUM, MIN, MAX, etc), else None
    - ``op`` is the operator ('', !, >, <, etc)

    If the column is invalid, then ``col`` and ``op`` are None
    '''
    colset = set(cols)
    # ?col= is returned quickly
    if col in colset:
        return col, None, ''
    # Check if it matches a non-empty operator, like ?col>~=
    # (`operators` lists multi-char ops before their 1-char suffixes)
    for op in operators:
        if col.endswith(op):
            name = col[:-len(op)]
            if name in colset:
                return name, None, op
            # If there's an aggregator, split it out, like ?col|SUM>~=
            elif _agg_sep in name:
                name, agg = name.rsplit(_agg_sep, 1)
                if name in colset:
                    return name, agg, op
    # If no operators match, it might be a pure aggregation, like ?col|SUM=
    if _agg_sep in col:
        name, agg = col.rsplit(_agg_sep, 1)
        if name in colset:
            return name, agg, ''
    # Otherwise we don't know what it is
    return None, None, None
525def _filter_frame_col(data, key, col, op, vals, meta):
526 # Apply type conversion for values
527 conv = data[col].dtype.type
528 vals = tuple(conv(val) for val in vals if val)
529 if op not in {'', '!'} and len(vals) == 0:
530 meta['ignored'].append((key, vals))
531 elif op == '':
532 data = data[data[col].isin(vals)] if len(vals) else data[pd.notnull(data[col])]
533 elif op == '!':
534 data = data[~data[col].isin(vals)] if len(vals) else data[pd.isnull(data[col])]
535 elif op == '>':
536 data = data[data[col] > min(vals)]
537 elif op == '>~':
538 data = data[data[col] >= min(vals)]
539 elif op == '<':
540 data = data[data[col] < max(vals)]
541 elif op == '<~':
542 data = data[data[col] <= max(vals)]
543 elif op == '!~':
544 data = data[~data[col].str.contains('|'.join(vals))]
545 elif op == '~': 545 ↛ 547line 545 didn't jump to line 547, because the condition on line 545 was never false
546 data = data[data[col].str.contains('|'.join(vals))]
547 meta['filters'].append((col, op, vals))
548 return data
def _filter_db_col(query, method, key, col, op, vals, column, conv, meta):
    '''
    - Updates ``query`` with a method (WHERE/HAVING) that sets '<key> <op> <vals>'
    - ``column`` is the underlying ColumnElement
    - ``conv`` is a type conversion function that converts ``vals`` to the correct type
    - Updates ``meta`` with the fields used for filtering (or ignored)
    '''
    # In PY2, .python_type returns str. We want unicode. Datetimes are also
    # compared as text.
    # Fix: use datetime.datetime -- the pd.datetime alias was deprecated in
    # pandas 0.25 and removed later, which made this line crash.
    sql_types = {six.binary_type: six.text_type, datetime.datetime: six.text_type}
    conv = sql_types.get(conv, conv)
    vals = tuple(conv(val) for val in vals if val)
    if op not in {'', '!'} and len(vals) == 0:
        # Comparison operators need at least one value: record and skip
        meta['ignored'].append((key, vals))
    elif op == '':
        # Test if column is not NULL. != None is NOT the same as is not None
        query = method(column.in_(vals) if len(vals) else column != None)   # noqa
    elif op == '!':
        # Test if column is NULL. == None is NOT the same as is None
        query = method(column.notin_(vals) if len(vals) else column == None)    # noqa
    elif op == '>':
        query = method(column > min(vals))
    elif op == '>~':
        query = method(column >= min(vals))
    elif op == '<':
        query = method(column < max(vals))
    elif op == '<~':
        query = method(column <= max(vals))
    elif op == '!~':
        query = method(column.notlike('%' + '%'.join(vals) + '%'))
    elif op == '~':
        query = method(column.like('%' + '%'.join(vals) + '%'))
    # NOTE: the filter is recorded even when it was ignored above
    meta['filters'].append((col, op, vals))
    return query
586def _filter_sort_columns(sort_filter, cols):
587 sorts, ignore_sorts = [], []
588 for col in sort_filter:
589 if col in cols:
590 sorts.append((col, True))
591 elif col.startswith('-') and col[1:] in cols:
592 sorts.append((col[1:], False))
593 else:
594 ignore_sorts.append(col)
595 return sorts, ignore_sorts
598def _filter_select_columns(col_filter, cols, meta):
599 '''
600 Checks ?_c=col&_c=-col for filter(). Takes values of ?_c= as col_filter and
601 data column names as cols. Returns 2 lists: show_cols as columns to show.
602 ignored_cols has column names not in the list, i.e. the ?_c= parameters that
603 are ignored.
604 '''
605 selected_cols, excluded_cols, ignored_cols = [], set(), []
606 for col in col_filter:
607 if col in cols:
608 selected_cols.append(col)
609 elif col.startswith('-') and col[1:] in cols:
610 excluded_cols.add(col[1:])
611 else:
612 ignored_cols.append(col)
613 if len(excluded_cols) > 0 and len(selected_cols) == 0:
614 selected_cols = cols
615 show_cols = [col for col in selected_cols if col not in excluded_cols]
616 meta['excluded'] = list(excluded_cols)
617 return show_cols, ignored_cols
620def _filter_groupby_columns(by, cols, meta):
621 '''
622 Checks ?_by=col&_by=col for filter().
624 - ``by``: list of column names to group by
625 - ``cols``: list of valid column names
626 - ``meta``: meta['by'] and meta['ignored'] are updated
628 Returns a list of columns to group by
629 '''
630 colset = set(cols)
631 for col in by:
632 if col in colset:
633 meta['by'].append(col)
634 else:
635 meta['ignored'].append(('_by', col))
636 return meta['by']
# If ?_c=col|avg is provided, this works in SQL but not in Pandas DataFrames.
# Map SQL aggregation names to their Pandas GroupBy.agg equivalents.
_frame_functions = {
    'avg': 'mean',
    'average': 'mean',
}
def _filter_frame(data, meta, controls, args, source='select', id=None):
    '''
    If ``source`` is ``'select'``, returns a DataFrame in which the DataFrame
    ``data`` is filtered using ``args``. Additional controls like _sort, etc are
    in ``controls``. Metadata is stored in ``meta``.

    If ``source`` is ``'update'``, filters using ``args`` but only for columns
    mentioned in ``id``. Resulting DataFrame is updated with remaining ``args``.
    Returns the updated rows.

    If ``source`` is ``'delete'``, filters using ``args`` but only for columns
    mentioned in ``id``. Deletes these rows. Returns the deleted rows.

    :arg data: dataframe
    :arg meta: dictionary of `filters`, `ignored`, `sort`, `offset`, `limit` params from kwargs
    :arg args: user arguments to filter the data
    :arg source: accepted values - `update`, `delete` for PUT, DELETE methods in FormHandler
    :arg id: list of id specific to data using which values can be updated
    '''
    # Avoid a mutable default argument for id
    id = [] if id is None else id
    original_data = data
    cols_for_update = {}
    cols_having = []
    for key, vals in args.items():
        # check if `key` is in the `id` list -- ONLY when data is updated
        if (source in ('update', 'delete') and key in id) or (source == 'select'):
            # Parse column names, ignoring missing / unmatched columns
            col, agg, op = _filter_col(key, data.columns)
            if col is None:
                meta['ignored'].append((key, vals))
                continue
            # Process aggregated columns AFTER filtering, not before (like HAVING clause)
            # e.g. ?sales|SUM=<val> should be applied only after the column is created
            if agg is not None:
                cols_having.append((key, col + _agg_sep + agg, op, vals))
                continue
            # Apply filters
            data = _filter_frame_col(data, key, col, op, vals, meta)
        elif source == 'update':
            # Update values should only contain 1 value. 2nd onwards are ignored
            if key not in data.columns or len(vals) == 0:
                meta['ignored'].append((key, vals))
            else:
                cols_for_update[key] = vals[0]
                if len(vals) > 1:
                    meta['ignored'].append((key, vals[1:]))
        else:
            meta['ignored'].append((key, vals))
    meta['count'] = len(data)
    if source == 'delete':
        # Drop the matched rows from the caller's DataFrame, in-place
        original_data.drop(data.index, inplace=True)
        return data
    elif source == 'update':
        # Coerce each update value to the column dtype, then write in-place
        conv = {k: v.type for k, v in data.dtypes.items()}
        for key, val in cols_for_update.items():
            original_data.loc[data.index, key] = conv[key](val)
        return data
    else:
        # Apply controls
        if '_by' in controls:
            by = _filter_groupby_columns(controls['_by'], data.columns, meta)
            # If ?_c is not specified, use 'col|sum' for all numeric columns
            # TODO: This does not support ?_c=-<col> to hide a column
            col_list = controls.get('_c', None)
            if col_list is None:
                col_list = [col + _agg_sep + 'sum' for col in data.columns      # noqa
                            if pd.api.types.is_numeric_dtype(data[col])]
            agg_cols = []
            agg_dict = AttrDict()
            for key in col_list:
                col, agg, val = _filter_col(key, data.columns)
                if agg is not None:
                    # Convert aggregation into a Pandas GroupBy agg function
                    agg = agg.lower()
                    agg = _frame_functions.get(agg, agg)
                    agg_cols.append(key)
                    if col in agg_dict:
                        agg_dict[col].append(agg)
                    else:
                        agg_dict[col] = [agg]
            if len(by) > 0:
                if not agg_cols:
                    # If no aggregation columns exist, just show groupby columns.
                    data = data.groupby(by).agg('size').reset_index()
                    data = data.iloc[:, [0]]
                else:
                    data = data.groupby(by).agg(agg_dict)
                    data.columns = agg_cols
                    data = data.reset_index()
                    # Apply HAVING operators
                    for key, col, op, vals in cols_having:
                        data = _filter_frame_col(data, key, col, op, vals, meta)
            else:
                # No groupby columns: aggregate the whole frame into one row
                row = [data[col].agg(op) for col, ops in agg_dict.items() for op in ops]
                data = pd.DataFrame([row], columns=agg_cols)
        elif '_c' in controls:
            show_cols, hide_cols = _filter_select_columns(controls['_c'], data.columns, meta)
            data = data[show_cols]
            if len(hide_cols) > 0:
                meta['ignored'].append(('_c', hide_cols))
        if '_sort' in controls:
            meta['sort'], ignore_sorts = _filter_sort_columns(controls['_sort'], data.columns)
            if len(meta['sort']) > 0:
                data = data.sort_values(by=[c[0] for c in meta['sort']],
                                        ascending=[c[1] for c in meta['sort']])
            if len(ignore_sorts) > 0:
                meta['ignored'].append(('_sort', ignore_sorts))
        if '_offset' in controls:
            try:
                # Multiple ?_offset= values: use the smallest
                offset = min(int(v) for v in controls['_offset'])
            except ValueError:
                raise ValueError('_offset not integer: %r' % controls['_offset'])
            data = data.iloc[offset:]
            meta['offset'] = offset
        if '_limit' in controls:
            try:
                # Multiple ?_limit= values: use the smallest
                limit = min(int(v) for v in controls['_limit'])
            except ValueError:
                raise ValueError('_limit not integer: %r' % controls['_limit'])
            data = data.iloc[:limit]
            meta['limit'] = limit
        return data
def _filter_db(engine, table, meta, controls, args, source='select', id=[]):
    '''
    Filter, update or delete rows of a database table via SQLAlchemy.

    It accepts the following parameters

    :arg sqlalchemy engine engine: constructed sqlalchemy string
    :arg database table table: table name in the mentioned database
    :arg controls: dictionary of `_sort`, `_c`, `_offset`, `_limit` params
    :arg meta: dictionary of `filters`, `ignored`, `sort`, `offset`, `limit` params from kwargs
    :arg args: dictionary of user arguments to filter the data
    :arg source: accepted values - `update`, `delete` for PUT, DELETE methods in FormHandler
    :arg id: list of keys specific to data using which values can be updated

    Returns the affected rowcount for `update` / `delete`, and a filtered
    DataFrame (via ``pd.read_sql``) for `select`.

    Note: ``id=[]`` is a mutable default, but it is only read (``key in id``),
    never mutated, so it is safe here.
    '''
    table = get_table(engine, table)
    cols = table.columns
    colslist = cols.keys()
    # Build the base statement according to the operation requested
    if source == 'delete':
        query = sqlalchemy.delete(table)
    elif source == 'update':
        query = sqlalchemy.update(table)
    else:
        query = sqlalchemy.select([table])
    cols_for_update = {}
    cols_having = []
    for key, vals in args.items():
        # check if `key`` is in the `id` list -- ONLY when data is updated
        if (source in ('update', 'delete') and key in id) or (source == 'select'):
            # Parse column names, ignoring missing / unmatched columns
            col, agg, op = _filter_col(key, colslist)
            if col is None:
                meta['ignored'].append((key, vals))
                continue
            # Process aggregated columns AFTER filtering, not before (like HAVING clause)
            # e.g. ?sales|SUM=<val> should be applied only after the column is created
            if agg is not None:
                cols_having.append((key, col + _agg_sep + agg, op, vals))
                continue
            # Apply filters
            query = _filter_db_col(query, query.where, key, col, op, vals,
                                   cols[col], cols[col].type.python_type, meta)
        elif source == 'update':
            # Update values should only contain 1 value. 2nd onwards are ignored
            if key not in cols or len(vals) == 0:
                meta['ignored'].append((key, vals))
            else:
                cols_for_update[key] = vals[0]
                if len(vals) > 1:
                    meta['ignored'].append((key, vals[1:]))
        else:
            # delete with a key not in `id`: record it as ignored
            meta['ignored'].append((key, vals))
    if source == 'delete':
        res = engine.execute(query)
        return res.rowcount
    elif source == 'update':
        query = query.values(cols_for_update)
        res = engine.execute(query)
        return res.rowcount
    else:
        # Apply controls (only meaningful for SELECT)
        if '_by' in controls:
            by = _filter_groupby_columns(controls['_by'], colslist, meta)
            query = query.group_by(*by)
            # If ?_c is not specified, use 'col|sum' for all numeric columns
            # TODO: This does not support ?_c=-<col> to hide a column
            col_list = controls.get('_c', None)
            if col_list is None:
                col_list = [col + _agg_sep + 'sum' for col, column in cols.items()      # noqa
                            if column.type.python_type.__name__ in _numeric_types]
            agg_cols = AttrDict([(col, cols[col]) for col in by])   # {label: ColumnElement}
            typ = {}                                                # {label: python type}
            for key in col_list:
                col, agg, val = _filter_col(key, colslist)
                if agg is not None:
                    # Convert aggregation into SQLAlchemy query
                    agg = agg.lower()
                    # Aggregate result type may differ from the column type
                    # (e.g. count -> int); fall back to the column's type
                    typ[key] = _agg_type.get(agg, cols[col].type.python_type)
                    agg_func = getattr(sqlalchemy.sql.expression.func, agg)
                    agg_cols[key] = agg_func(cols[col]).label(key)
            if not agg_cols:
                return pd.DataFrame()
            query = query.with_only_columns(agg_cols.values())
            # Apply HAVING operators (deferred filters on aggregated columns)
            for key, col, op, vals in cols_having:
                query = _filter_db_col(query, query.having, key, col, op, vals,
                                       agg_cols[col], typ[col], meta)
        elif '_c' in controls:
            show_cols, hide_cols = _filter_select_columns(controls['_c'], colslist, meta)
            query = query.with_only_columns([cols[col] for col in show_cols])
            if len(hide_cols) > 0:
                meta['ignored'].append(('_c', hide_cols))
            if len(show_cols) == 0:
                return pd.DataFrame()
        if '_sort' in controls:
            # Sorting is allowed on table columns AND on computed (aggregate) labels
            meta['sort'], ignore_sorts = _filter_sort_columns(
                controls['_sort'], colslist + query.columns.keys())
            for col, asc in meta['sort']:
                orderby = sqlalchemy.asc if asc else sqlalchemy.desc
                query = query.order_by(orderby(col))
            if len(ignore_sorts) > 0:
                meta['ignored'].append(('_sort', ignore_sorts))
        if '_offset' in controls:
            try:
                # Multiple ?_offset= values: use the smallest
                offset = min(int(v) for v in controls['_offset'])
            except ValueError:
                raise ValueError('_offset not integer: %r' % controls['_offset'])
            query = query.offset(offset)
            meta['offset'] = offset
        if '_limit' in controls:
            try:
                # Multiple ?_limit= values: use the smallest
                limit = min(int(v) for v in controls['_limit'])
            except ValueError:
                raise ValueError('_limit not integer: %r' % controls['_limit'])
            query = query.limit(limit)
            meta['limit'] = limit
        return pd.read_sql(query, engine)
888_VEGA_SCRIPT = os.path.join(_FOLDER, 'download.vega.js')
def download(data, format='json', template=None, args={}, **kwargs):
    '''
    Download a DataFrame or dict of DataFrames in various formats. This is used
    by :py:class:`gramex.handlers.FormHandler`. You are **strongly** advised to
    try it before creating your own FunctionHandler.

    Usage as a FunctionHandler::

        def download_as_csv(handler):
            handler.set_header('Content-Type', 'text/csv')
            handler.set_header('Content-Disposition', 'attachment;filename=data.csv')
            return gramex.data.download(dataframe, format='csv')

    It takes the following arguments:

    :arg dataset data: A DataFrame or a dict of DataFrames
    :arg str format: Output format. Can be ``csv|json|html|xlsx|template``
    :arg file template: Path to template file for ``template`` format
    :arg dict args: dictionary of user arguments to subsitute spec
    :arg dict kwargs: Additional parameters that are passed to the relevant renderer
    :return: bytes with the download file contents

    When ``data`` is a DataFrame, this is what different ``format=`` parameters
    return:

    - ``csv`` returns a UTF-8-BOM encoded CSV file of the dataframe
    - ``xlsx`` returns an Excel file with 1 sheet named ``data``. kwargs are
      passed to ``.to_excel(index=False)``
    - ``html`` returns a HTML file with a single table. kwargs are passed to
      ``.to_html(index=False)``
    - ``json`` returns a JSON file. kwargs are passed to
      ``.to_json(orient='records', force_ascii=True)``.
    - ``template`` returns a Tornado template rendered file. The template
      receives ``data`` as ``data`` and any additional kwargs.
    - ``pptx`` returns a PPTX generated by pptgen
    - ``seaborn`` or ``sns`` returns a Seaborn generated chart
    - ``vega`` returns JavaScript that renders a Vega chart

    When ``data`` is a dict of DataFrames, the following additionally happens:

    - ``format='csv'`` renders all DataFrames one below the other, adding the
      key as heading
    - ``format='xlsx'`` renders each DataFrame on a sheet whose name is the key
    - ``format='html'`` renders tables below one another with the key as heading
    - ``format='json'`` renders as a dict of DataFrame JSONs
    - ``format='template'`` sends ``data`` and all ``kwargs`` as passed to the
      template
    - ``format='pptx'`` passes ``data`` as a dict of datasets to pptgen
    - ``format='vega'`` passes ``data`` as a dict of datasets to Vega

    You need to set the MIME types on the handler yourself. Recommended MIME
    types are in gramex.yaml under handler.FormHandler.
    '''
    # Normalize input into a dict of DataFrames. `multiple` records whether
    # the caller passed a dict (affects headings / sheet names / JSON shape).
    if isinstance(data, dict):
        for key, val in data.items():
            if not isinstance(val, pd.DataFrame):
                raise ValueError('download({"%s": %r}) invalid type' % (key, type(val)))
        if not len(data):
            raise ValueError('download() data requires at least 1 DataFrame')
        multiple = True
    elif not isinstance(data, pd.DataFrame):
        raise ValueError('download(%r) invalid type' % type(data))
    else:
        data = {'data': data}
        multiple = False

    def kw(**conf):
        # Merge renderer defaults into caller kwargs without overriding them
        return merge(kwargs, conf, mode='setdefault')

    if format == 'csv':
        # csv.writer requires BytesIO in PY2 and StringIO in PY3.
        # I can't see an elegant way out of this other than writing code for each.
        if six.PY2:
            out = io.BytesIO()
            kw(index=False, encoding='utf-8')
            for index, (key, val) in enumerate(data.items()):
                if index > 0:
                    out.write(b'\n')
                if multiple:
                    # Write the dict key as a heading line before each table
                    out.write(key.encode('utf-8') + b'\n')
                val.to_csv(out, **kwargs)
            result = out.getvalue()
            # utf-8-sig encoding returns the result with a UTF-8 BOM. Easier to open in Excel
            return ''.encode('utf-8-sig') + result if result.strip() else result
        else:
            out = io.StringIO()
            kw(index=False)
            for index, (key, val) in enumerate(data.items()):
                if index > 0:
                    out.write('\n')
                if multiple:
                    out.write(key + '\n')
                val.to_csv(out, **kwargs)
            result = out.getvalue()
            # utf-8-sig encoding returns the result with a UTF-8 BOM. Easier to open in Excel
            return result.encode('utf-8-sig') if result.strip() else result.encode('utf-8')
    elif format == 'template':
        return gramex.cache.open(template, 'template').generate(
            data=data if multiple else data['data'], **kwargs)
    elif format == 'html':
        out = io.StringIO()
        kw(index=False)
        for key, val in data.items():
            if multiple:
                out.write('<h1>%s</h1>' % key)
            val.to_html(out, **kwargs)
        return out.getvalue().encode('utf-8')
    elif format in {'xlsx', 'xls'}:
        out = io.BytesIO()
        kw(index=False)
        # TODO: Create and use a FrameWriter for formatting
        with pd.ExcelWriter(out, engine='xlsxwriter') as writer:
            for key, val in data.items():
                # One sheet per dataset; the dict key becomes the sheet name
                val.to_excel(writer, sheet_name=key, **kwargs)
        return out.getvalue()
    elif format in {'pptx', 'ppt'}:
        from gramex.pptgen import pptgen    # noqa
        out = io.BytesIO()
        pptgen(target=out, data=data, is_formhandler=True, **kwargs)
        return out.getvalue()
    elif format in {'seaborn', 'sns'}:
        # NOTE: this rebinds `kw` from the helper function to an AttrDict of
        # chart options. Safe because the kw() helper is not used in this branch.
        kw = AttrDict()
        defaults = {'chart': 'barplot', 'ext': 'png', 'data': 'data', 'dpi': 96,
                    'width': 640, 'height': 480}
        for key, default in defaults.items():
            # Pop chart options out of kwargs; the rest go to the seaborn call
            kw[key] = kwargs.pop(key, default)
        import matplotlib
        matplotlib.use('Agg')       # Before importing seaborn, set a headless backend
        import seaborn as sns
        plot = getattr(sns, kw.chart)(data=data.get(kw.data), **kwargs)
        out = io.BytesIO()
        # Axes-level seaborn functions expose .figure; grid objects expose .fig
        fig = plot.figure if hasattr(plot, 'figure') else plot.fig
        for k in ['dpi', 'width', 'height']:
            kw[k] = float(kw[k])
        fig.set_size_inches(kw.width / kw.dpi, kw.height / kw.dpi)
        fig.savefig(out, format=kw.ext, dpi=kw.dpi)
        fig.clear()
        return out.getvalue()
    elif format in {'vega', 'vega-lite', 'vegam'}:
        kwargs = kw(orient='records', force_ascii=True)
        spec = kwargs.pop('spec', {})
        kwargs.pop('handler', None)
        out = io.BytesIO()
        # conf = {..., spec: {..., data: __DATA__}}
        if isinstance(spec.get('data'), (dict, list)) or 'fromjson' in spec:
            # support only one dataset
            values = list(data.values())
            out.write(values[0].to_json(**kwargs).encode('utf-8'))
            out = out.getvalue()
        else:
            spec['data'] = '__DATA__'
            # Emit Vega "named data": {"name": <key>, "values": [...]} per dataset
            for index, (key, val) in enumerate(data.items()):
                out.write(b',{"name":' if index > 0 else b'{"name":')
                out.write(json_encode(key).encode('utf-8'))
                out.write(b',"values":')
                out.write(val.to_json(**kwargs).encode('utf-8'))
                out.write(b'}')
            out = out.getvalue()
            if format == 'vega':
                # Vega (not vega-lite/vegam) expects a list of named datasets
                out = b'[' + out + b']'
        kwargs['spec'], _ = _replace('', args, spec)
        conf = json.dumps(kwargs, ensure_ascii=True, separators=(',', ':'), indent=None)
        # Splice the raw JSON data bytes in place of the "__DATA__" placeholder
        conf = conf.encode('utf-8').replace(b'"__DATA__"', out)
        script = gramex.cache.open(_VEGA_SCRIPT, 'bin')
        return script.replace(b'/*{conf}*/', conf)
    else:
        # Default: JSON. A dict of DataFrames becomes {key: [...], ...}
        out = io.BytesIO()
        kwargs = kw(orient='records', force_ascii=True)
        if multiple:
            out.write(b'{')
            for index, (key, val) in enumerate(data.items()):
                if index > 0:
                    out.write(b',')
                out.write(json_encode(key).encode('utf-8'))
                out.write(b':')
                out.write(val.to_json(**kwargs).encode('utf-8'))
            out.write(b'}')
        else:
            out.write(data['data'].to_json(**kwargs).encode('utf-8'))
        return out.getvalue()
def dirstat(url, timeout=10, **kwargs):
    '''
    Return a DataFrame with the list of all files & directories under the url.

    It accepts the following parameters:

    :arg str url: path to a directory, or a URL like ``dir:///c:/path/``,
        ``dir:////root/dir/``. Raises ``OSError`` if url points to a missing
        location or is not a directory.
    :arg int timeout: max seconds to wait. ``None`` to wait forever. (default: 10)
    :return: a DataFrame with columns:

        - ``type``: extension with a ``.`` prefix -- or ``dir``
        - ``dir``: directory path to the file relative to the URL
        - ``name``: file name (including extension)
        - ``path``: full path to file or dir. This equals url / dir / name
        - ``size``: file size
        - ``mtime``: last modified time in seconds since epoch
        - ``level``: path depth (i.e. the number of paths in dir)
    '''
    # Accept both plain paths and SQLAlchemy-style URLs like dir:///c:/path/
    try:
        url = sqlalchemy.engine.url.make_url(url)
        target = url.database
    except sqlalchemy.exc.ArgumentError:
        target = url
    if not os.path.isdir(target):
        raise OSError('dirstat: %s is not a directory' % target)
    target = os.path.normpath(target)
    result = []
    start_time = time.time()
    for dirpath, dirnames, filenames in os.walk(target):
        if timeout and time.time() - start_time > timeout:
            app_log.debug('dirstat: %s timeout (%.1fs)', url, timeout)
            break
        # os.walk guarantees dirpath begins with target. Strip that prefix by
        # slicing -- NOT str.replace, which would also mangle any later
        # occurrence of target inside the path -- and normalize separators.
        # dirname and level are the same for every entry in this directory,
        # so compute them once per dirpath instead of once per entry.
        dirname = dirpath[len(target):].replace(os.sep, '/') + '/'
        level = dirname.count('/')
        for name in dirnames:
            path = os.path.join(dirpath, name)
            stat = os.stat(path)
            result.append({
                'path': path, 'dir': dirname, 'name': name, 'type': 'dir',
                'size': stat.st_size, 'mtime': stat.st_mtime, 'level': level,
            })
        for name in filenames:
            path = os.path.join(dirpath, name)
            stat = os.stat(path)
            result.append({
                'path': path, 'dir': dirname, 'name': name,
                'type': os.path.splitext(name)[-1],
                'size': stat.st_size, 'mtime': stat.st_mtime, 'level': level,
            })
    return pd.DataFrame(result)
def filtercols(url, args={}, meta={}, engine=None, table=None, ext=None,
               query=None, queryfile=None, transform=None, transform_kwargs={}, **kwargs):
    '''
    Filter data and extract unique values of each column using URL query parameters.
    Typical usage::

        filtered = gramex.data.filtercols(dataframe, args=handler.args)
        filtered = gramex.data.filtercols('file.csv', args=handler.args)
        filtered = gramex.data.filtercols('mysql://server/db', table='table', args=handler.args)

    It accepts the following parameters:

    :arg source url: Pandas DataFrame, sqlalchemy URL, directory or file name,
        `.format``-ed using ``args``.
    :arg dict args: URL query parameters as a dict of lists. Pass handler.args or parse_qs results
    :arg dict meta: this dict is updated with metadata during the course of filtering
    :arg str engine: over-rides the auto-detected engine. Can be 'dataframe', 'file',
        'http', 'https', 'sqlalchemy', 'dir'
    :arg str table: table name (if url is an SQLAlchemy URL), ``.format``-ed
        using ``args``.
    :arg str ext: file extension (if url is a file). Defaults to url extension
    :arg str query: optional SQL query to execute (if url is a database),
        ``.format``-ed using ``args`` and supports SQLAlchemy SQL parameters.
        Loads entire result in memory before filtering.
    :arg str queryfile: optional SQL query file to execute (if url is a database).
        Same as specifying the ``query:`` in a file. Overrides ``query:``
    :arg function transform: optional in-memory transform of source data. Takes
        the result of gramex.cache.open or gramex.cache.query. Must return a
        DataFrame. Applied to both file and SQLAlchemy urls.
    :arg dict transform_kwargs: optional keyword arguments to be passed to the
        transform function -- apart from data
    :arg dict kwargs: Additional parameters are passed to
        :py:func:`gramex.cache.open` or ``sqlalchemy.create_engine``
    :return: a dict of filtered DataFrames, one per ``?_c=`` column

    Remaining kwargs are passed to :py:func:`gramex.cache.open` if ``url`` is a file, or
    ``sqlalchemy.create_engine`` if ``url`` is a SQLAlchemy URL.

    If this is used in a handler as::

        filtered = gramex.data.filtercols(dataframe, args=handler.args)

    ... then calling the handler with ``?_c=state&_c=district`` returns all unique values
    in columns of ``dataframe`` where columns are state and district.

    Column filter supports like this:

    - ``?_c=y&x`` returns df with unique values of y where x is not null
    - ``?_c=y&x=val`` returns df with unique values of y where x == val
    - ``?_c=y&y=val`` returns df with unique values of y, ignores filter y == val
    - ``?_c=y&x>=val`` returns df with unique values of y where x > val
    - ``?_c=x&_c=y&x=val`` returns df with unique values of x ignoring filter x == val
      and returns unique values of y where x == val

    Arguments are converted to the type of the column before comparing. If this
    fails, it raises a ValueError.

    These URL query parameters control the output:

    - ``?_sort=col`` sorts column col in ascending order. ``?_sort=-col`` sorts
      in descending order.
    - ``?_limit=100`` limits the result to 100 rows
    - ``?_offset=100`` starts showing the result from row 100. Default: 0
    - ``?_c=x&_c=y`` returns only columns ``[x, y]``. ``?_c=-col`` drops col.

    If a column name matches one of the above, you cannot filter by that column.
    Avoid column names beginning with _.

    To get additional information about the filtering, use::

        meta = {}      # Create a variable which will be filled with more info
        filtered = gramex.data.filter(data, meta=meta, **handler.args)

    The ``meta`` variable is populated with the following keys:

    - ``filters``: Applied filters as ``[(col, op, val), ...]``
    - ``ignored``: Ignored filters as ``[(col, vals), ('_sort', cols), ...]``
    - ``excluded``: Excluded columns as ``[col, ...]``
    - ``sort``: Sorted columns as ``[(col, True), ...]``. The second parameter is ``ascending=``
    - ``offset``: Offset as integer. Defaults to 0
    - ``limit``: Limit as integer - ``100`` if limit is not applied
    - ``count``: Total number of rows, if available

    These variables may be useful to show additional information about the
    filtered data.
    '''
    # Auto-detect engine.
    if engine is None:
        engine = get_engine(url)
    result = {}
    # Default each column's result to at most 100 unique values, unless
    # the caller overrides with ?_limit=. Multiple values: use the smallest.
    limit = args.get('_limit', [100])
    try:
        limit = min(int(v) for v in limit)
    except ValueError:
        raise ValueError('_limit not integer: %r' % limit)
    for col in args.get('_c', []):
        # col_args takes _sort, _c and all filters from args
        col_args = {}
        for key, value in args.items():
            if key in ['_sort']:
                col_args[key] = value
            # Ignore any filters on the column we are currently processing
            if not key.startswith('_') and key != col:
                col_args[key] = value
        # Group by the target column with no aggregates: yields its unique values
        col_args['_by'] = [col]
        col_args['_c'] = []
        col_args['_limit'] = [limit]
        # Forward the documented source parameters (ext, query, queryfile,
        # transform, transform_kwargs, engine) to filter(). Previously these
        # were accepted but silently dropped, so query/transform-based
        # FormHandlers got wrong results from filtercols.
        # NOTE(review): `meta` is accepted but not forwarded -- filter() would
        # reset it per column, so only the last column's metadata would
        # survive. Confirm the intended meta semantics before forwarding.
        result[col] = gramex.data.filter(
            url, args=col_args, engine=engine, table=table, ext=ext,
            query=query, queryfile=queryfile, transform=transform,
            transform_kwargs=transform_kwargs, **kwargs)
    return result