Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1''' 

2Interact with data from the browser 

3''' 

4import io 

5import os 

6import re 

7import six 

8import time 

9import json 

10import sqlalchemy 

11import pandas as pd 

12import gramex.cache 

13from tornado.escape import json_encode 

14from sqlalchemy.sql import text 

15from gramex.config import merge, app_log 

16from orderedattrdict import AttrDict 

17 

# Cache of sqlalchemy engines, keyed by URL (see create_engine below)
_ENGINE_CACHE = {}
# Cache of sqlalchemy MetaData objects, keyed by engine (see get_table below)
_METADATA_CACHE = {}
# Directory containing this module (NOTE(review): not used in the visible code -- confirm)
_FOLDER = os.path.dirname(os.path.abspath(__file__))
# Dummy path used by _path_safe to detect sub-directories
_path_safe_root = os.path.realpath('/root/dir')
# Aggregator separator. ?col|SUM treats SUM as an aggregate function
_agg_sep = '|'
# List of aggregated types returned by operators (if different from column type)
# Note: For aggregation functions, see:
# SQLite: https://www.sqlite.org/lang_aggfunc.html
# MySQL: https://dev.mysql.com/doc/refman/8.0/en/group-by-functions.html
# PostgreSQL: https://www.postgresql.org/docs/9.5/static/functions-aggregate.html
# SQL Server: http://bit.ly/2MYPgQi
# DB2: https://ibm.co/2Kfbnjw
# Oracle: https://docs.oracle.com/database/121/SQLRF/functions003.htm
_agg_type = {
    'sum': float,
    'count': int,
    'avg': float,
    'stdev': float,      # MS SQL version of stddev
    'stddev': float,
    'rank': int,
    'percent_rank': float,
    # The following types are the same as the columns
    # first, last, min, max, median
}
# List of Python types returned by SQLAlchemy
_numeric_types = {'int', 'long', 'float', 'Decimal'}

46 

47 

48def _transform_fn(transform, transform_kwargs): 

49 if transform is not None and transform_kwargs is not None: 

50 return lambda v: transform(v, **transform_kwargs) 

51 return transform 

52 

53 

def _replace(engine, args, *vars, **kwargs):
    '''
    ``.format()`` every string in ``vars`` and ``kwargs`` (recursing into
    lists and dicts) using the first value of each arg in ``args`` -- but only
    args whose value passes the escape check for this engine (SQL-safe for
    sqlalchemy, path-safe otherwise). Returns the formatted ``vars`` as a list
    with the formatted ``kwargs`` dict appended.
    '''
    escape = _sql_safe if engine == 'sqlalchemy' else _path_safe
    params = {key: val[0] for key, val in args.items()
              if len(val) > 0 and escape(val[0])}

    def _fmt(value):
        if isinstance(value, six.string_types):
            return value.format(**params)
        if isinstance(value, list):
            return [_fmt(item) for item in value]
        if isinstance(value, dict):
            return AttrDict((key, _fmt(item)) for key, item in value.items())
        return value

    result = _fmt(list(vars))
    result.append(_fmt(kwargs))
    return result

68 

69 

def filter(url, args=None, meta=None, engine=None, table=None, ext=None,
           query=None, queryfile=None, transform=None, transform_kwargs=None, **kwargs):
    '''
    Filters data using URL query parameters. Typical usage::

        filtered = gramex.data.filter(dataframe, args=handler.args)
        filtered = gramex.data.filter('file.csv', args=handler.args)
        filtered = gramex.data.filter('mysql://server/db', table='table', args=handler.args)

    It accepts the following parameters:

    :arg source url: Pandas DataFrame, sqlalchemy URL, directory or file name,
        http(s) data file, all ``.format``-ed using ``args``.
    :arg dict args: URL query parameters as a dict of lists. Pass handler.args or parse_qs results
    :arg dict meta: this dict is updated with metadata during the course of filtering
    :arg str engine: over-rides the auto-detected engine. Can be 'dataframe', 'file',
        'http', 'https', 'sqlalchemy', 'dir'
    :arg str table: table name (if url is an SQLAlchemy URL), ``.format``-ed using ``args``.
    :arg str ext: file extension (if url is a file). Defaults to url extension
    :arg str query: optional SQL query to execute (if url is a database),
        ``.format``-ed using ``args`` and supports SQLAlchemy SQL parameters.
        Loads entire result in memory before filtering.
    :arg str queryfile: optional SQL query file to execute (if url is a database).
        Same as specifying the ``query:`` in a file. Overrides ``query:``
    :arg function transform: optional in-memory transform of source data. Takes
        the result of gramex.cache.open or gramex.cache.query. Must return a
        DataFrame. Applied to both file and SQLAlchemy urls.
    :arg dict transform_kwargs: optional keyword arguments to be passed to the
        transform function -- apart from data
    :arg dict kwargs: Additional parameters are passed to
        :py:func:`gramex.cache.open` or ``sqlalchemy.create_engine``
    :return: a filtered DataFrame

    If a table or query is passed to an SQLAlchemy url, it is formatted using
    ``args``. For example, ``table='{xxx}'`` with ``?xxx=sales`` returns rows
    from the sales table.

    **NOTE**: To avoid SQL injection attacks, only values without spaces are
    allowed. So ``?col=City Name`` or ``?col=City+Name`` **will not** work.

    The URL supports operators filter like this:

    - ``?x`` selects x is not null
    - ``?x!`` selects x is null
    - ``?x=val`` selects x == val
    - ``?x!=val`` selects x != val
    - ``?x>=val`` selects x > val
    - ``?x>~=val`` selects x >= val
    - ``?x<=val`` selects x < val
    - ``?x<~=val`` selects x <= val
    - ``?x~=val`` selects x matches val as a regular expression
    - ``?x!~=val`` selects x does not match val as a regular expression

    Multiple filters are combined into an AND clause. If the same column has
    multiple values, they are combined (IN / NOT IN / regex-OR / MIN / MAX).
    Arguments are converted to the type of the column before comparing. If
    this fails, it raises a ValueError.

    These URL query parameters control the output:

    - ``?_sort=col`` sorts column col in ascending order. ``?_sort=-col`` sorts
      in descending order.
    - ``?_limit=100`` limits the result to 100 rows
    - ``?_offset=100`` starts showing the result from row 100. Default: 0
    - ``?_c=x&_c=y`` returns only columns ``[x, y]``. ``?_c=-col`` drops col.
    - ``?_by=col`` groups by col

    If a column name matches one of the above, you cannot filter by that column.
    Avoid column names beginning with _.

    The ``meta`` dict is populated with: ``filters``, ``ignored``, ``excluded``,
    ``sort``, ``offset``, ``limit``, ``count``, ``by``.
    '''
    # BUG FIX: args={} and meta={} were mutable default arguments, shared
    # across calls (_pop_controls mutates args in place). None sentinels
    # preserve the old behavior for callers that rely on the defaults.
    if args is None:
        args = {}
    if meta is None:
        meta = {}
    # Auto-detect engine.
    if engine is None:
        engine = get_engine(url)

    # Reset the metadata for this call
    meta.update({
        'filters': [],      # Applied filters as [(col, op, val), ...]
        'ignored': [],      # Ignored filters as [(col, vals), ...]
        'sort': [],         # Sorted columns as [(col, asc), ...]
        'offset': 0,        # Offset as integer
        'limit': None,      # Limit as integer - None if not applied
        'by': [],           # Group by columns as [col, ...]
    })
    # Extract _sort/_limit/_offset/_c/_by controls, then bind transform_kwargs
    controls = _pop_controls(args)
    transform = _transform_fn(transform, transform_kwargs)
    # .format() url/table/query/etc using safe arg values
    url, table, ext, query, queryfile, kwargs = _replace(
        engine, args, url, table, ext, query, queryfile, **kwargs)

    # Use the appropriate filter function based on the engine
    if engine == 'dataframe':
        data = transform(url) if callable(transform) else url
        return _filter_frame(data, meta=meta, controls=controls, args=args)
    elif engine == 'dir':
        data = dirstat(url, **args)
        data = transform(data) if callable(transform) else data
        return _filter_frame(data, meta=meta, controls=controls, args=args)
    elif engine in {'file', 'http', 'https'}:
        if engine == 'file' and not os.path.exists(url):
            raise OSError('url: %s not found' % url)
        # Get the full dataset. Then filter it
        data = gramex.cache.open(url, ext, transform=transform, **kwargs)
        return _filter_frame(data, meta=meta, controls=controls, args=args)
    elif engine == 'sqlalchemy':
        engine = create_engine(url, **kwargs)
        if query or queryfile:
            if queryfile:
                query = gramex.cache.open(queryfile, 'text')
            # state tells gramex.cache.query when to re-run the query
            state = None
            if isinstance(table, six.string_types):
                state = table if ' ' in table else [table]
            elif isinstance(table, (list, tuple)):
                state = [t for t in table]
            elif table is not None:
                raise ValueError('table: must be string or list of strings, not %r' % table)
            all_params = {k: v[0] for k, v in args.items() if len(v) > 0}
            data = gramex.cache.query(text(query), engine, state, params=all_params)
            data = transform(data) if callable(transform) else data
            return _filter_frame(data, meta=meta, controls=controls, args=args)
        elif table:
            if callable(transform):
                # With a transform, load the whole table, then filter in memory
                data = gramex.cache.query(table, engine, [table])
                return _filter_frame(transform(data), meta=meta, controls=controls, args=args)
            else:
                # Without a transform, filter in the database itself
                return _filter_db(engine, table, meta=meta, controls=controls, args=args)
        else:
            raise ValueError('No table: or query: specified')
    else:
        raise ValueError('engine: %s invalid. Can be sqlalchemy|file|dataframe' % engine)

247 

248 

def delete(url, meta=None, args=None, engine=None, table=None, ext=None, id=None,
           query=None, queryfile=None, transform=None, transform_kwargs=None, **kwargs):
    '''
    Deletes data using URL query parameters. Typical usage::

        count = gramex.data.delete(dataframe, args=handler.args, id=['id'])
        count = gramex.data.delete('file.csv', args=handler.args, id=['id'])
        count = gramex.data.delete('mysql://server/db', table='table', args=handler.args, id='id')

    ``id`` is a column name or a list of column names defining the primary key.
    Calling this in a handler with ``?id=1&id=2`` deletes rows with id is 1 or 2.

    It accepts the same parameters as :py:func:`filter`, and returns the number
    of deleted rows.
    '''
    # BUG FIX: meta={} and transform_kwargs={} were mutable default arguments
    if meta is None:
        meta = {}
    if engine is None:
        engine = get_engine(url)
    meta.update({'filters': [], 'ignored': []})
    controls = _pop_controls(args)
    # BUG FIX: transform_kwargs was accepted but silently ignored.
    # Bind it into transform, the same way filter() does.
    transform = _transform_fn(transform, transform_kwargs)
    url, table, ext, query, queryfile, kwargs = _replace(
        engine, args, url, table, ext, query, queryfile, **kwargs)
    if engine == 'dataframe':
        # _filter_frame(source='delete') drops matching rows from url in place
        data_filtered = _filter_frame(url, meta=meta, controls=controls,
                                      args=args, source='delete', id=id)
        return len(data_filtered)
    elif engine == 'file':
        data = gramex.cache.open(url, ext, transform=transform, **kwargs)
        data_filtered = _filter_frame(data, meta=meta, controls=controls,
                                      args=args, source='delete', id=id)
        # Persist the remaining rows back to the file
        gramex.cache.save(data, url, ext, index=False, **kwargs)
        return len(data_filtered)
    elif engine == 'sqlalchemy':
        if table is None:
            raise ValueError('No table: specified')
        engine = create_engine(url, **kwargs)
        return _filter_db(engine, table, meta=meta, controls=controls, args=args,
                          source='delete', id=id)
    else:
        raise ValueError('engine: %s invalid. Can be sqlalchemy|file|dataframe' % engine)
    # NOTE: the unreachable trailing `return 0` was removed -- every branch
    # above returns or raises.

289 

290 

def update(url, meta=None, args=None, engine=None, table=None, ext=None, id=None,
           query=None, queryfile=None, transform=None, transform_kwargs=None, **kwargs):
    '''
    Update data using URL query parameters. Typical usage::

        count = gramex.data.update(dataframe, args=handler.args, id=['id'])
        count = gramex.data.update('file.csv', args=handler.args, id=['id'])
        count = gramex.data.update('mysql://server/db', table='table', args=handler.args, id='id')

    ``id`` is a column name or a list of column names defining the primary key.
    Calling this in a handler with ``?id=1&x=2`` updates x=2 where id=1.

    It accepts the same parameters as :py:func:`filter`, and returns the number of updated rows.
    '''
    # BUG FIX: meta={} and transform_kwargs={} were mutable default arguments
    if meta is None:
        meta = {}
    if engine is None:
        engine = get_engine(url)
    meta.update({'filters': [], 'ignored': []})
    controls = _pop_controls(args)
    # BUG FIX: transform_kwargs was accepted but silently ignored.
    # Bind it into transform, the same way filter() does.
    transform = _transform_fn(transform, transform_kwargs)
    url, table, ext, query, queryfile, kwargs = _replace(
        engine, args, url, table, ext, query, queryfile, **kwargs)
    if engine == 'dataframe':
        # _filter_frame(source='update') writes the new values into url in place
        data_updated = _filter_frame(
            url, meta=meta, controls=controls, args=args, source='update', id=id)
        return len(data_updated)
    elif engine == 'file':
        data = gramex.cache.open(url, ext, transform=transform, **kwargs)
        data_updated = _filter_frame(
            data, meta=meta, controls=controls, args=args, source='update', id=id)
        # Persist the updated rows back to the file
        gramex.cache.save(data, url, ext, index=False, **kwargs)
        return len(data_updated)
    elif engine == 'sqlalchemy':
        if table is None:
            raise ValueError('No table: specified')
        engine = create_engine(url, **kwargs)
        return _filter_db(engine, table, meta=meta, controls=controls, args=args,
                          source='update', id=id)
    else:
        raise ValueError('engine: %s invalid. Can be sqlalchemy|file|dataframe' % engine)
    # NOTE: the unreachable trailing `return 0` was removed -- every branch
    # above returns or raises.

330 

331 

def insert(url, meta=None, args=None, engine=None, table=None, ext=None, id=None,
           query=None, queryfile=None, transform=None, transform_kwargs=None, **kwargs):
    '''
    Insert data using URL query parameters. Typical usage::

        count = gramex.data.insert(dataframe, args=handler.args, id=['id'])
        count = gramex.data.insert('file.csv', args=handler.args, id=['id'])
        count = gramex.data.insert('mysql://server/db', table='table', args=handler.args, id='id')

    ``id`` is a column name or a list of column names defining the primary key.
    Calling this in a handler with ``?id=3&x=2`` inserts a new record with id=3 and x=2.

    If the target file / table does not exist, it is created.

    It accepts the same parameters as :py:func:`filter`, and returns the number of updated rows.
    '''
    # BUG FIX: meta={} and transform_kwargs={} were mutable default arguments.
    # (transform / transform_kwargs are accepted for interface parity with
    # filter() but are not applied here -- inserts use raw args.)
    if meta is None:
        meta = {}
    if engine is None:
        engine = get_engine(url)
    if args:
        _pop_controls(args)
    # BUG FIX: empty/missing args previously crashed with an opaque
    # "max() arg is an empty sequence" (or TypeError for None)
    if not args:
        raise ValueError('insert: no args: specified')
    meta.update({'filters': [], 'ignored': []})
    # If values do not have equal number of elements, pad them and warn
    rowcount = max(len(val) for val in args.values())
    for key, val in args.items():
        rows = len(val)
        if 0 < rows < rowcount:
            val += [val[-1]] * (rowcount - rows)
            app_log.warning('data.insert: column %s has %d rows not %d. Extended last value %s',
                            key, rows, rowcount, val[-1])
    rows = pd.DataFrame.from_dict(args)
    url, table, ext, query, queryfile, kwargs = _replace(
        engine, args, url, table, ext, query, queryfile, **kwargs)
    if engine == 'dataframe':
        rows = _pop_columns(rows, url.columns, meta['ignored'])
        # NOTE(review): DataFrame.append returns a NEW frame; rebinding the
        # local `url` does not modify the caller's DataFrame -- confirm intent
        url = url.append(rows, sort=False)
        return len(rows)
    elif engine == 'file':
        try:
            data = gramex.cache.open(url, ext, transform=None, **kwargs)
        except (OSError, IOError):
            # File does not exist (or can't be read): create it from the rows
            data = rows
        else:
            rows = _pop_columns(rows, data.columns, meta['ignored'])
            data = data.append(rows, sort=False)
        gramex.cache.save(data, url, ext, index=False, **kwargs)
        return len(rows)
    elif engine == 'sqlalchemy':
        if table is None:
            raise ValueError('No table: specified')
        engine = create_engine(url, **kwargs)
        try:
            cols = get_table(engine, table).columns
        except sqlalchemy.exc.NoSuchTableError:
            # Table does not exist yet: keep all columns, create it below
            pass
        else:
            rows = _pop_columns(rows, [col.name for col in cols], meta['ignored'])
        if '.' in table:
            kwargs['schema'], table = table.rsplit('.', 1)
        # pandas does not document engine.dialect.has_table so it might change.
        if not engine.dialect.has_table(engine, table) and id:
            # Create the table with `id` as the primary key before inserting
            engine.execute(pd.io.sql.get_schema(rows, name=table, keys=id, con=engine))
        rows.to_sql(table, engine, if_exists='append', index=False, **kwargs)
        return len(rows)
    else:
        raise ValueError('engine: %s invalid. Can be sqlalchemy|file|dataframe' % engine)
    # NOTE: the unreachable trailing `return 0` was removed -- every branch
    # above returns or raises.

397 

398 

def get_engine(url):
    '''
    Detect the type of ``url`` passed. Returns:

    - ``'dataframe'`` if url is a Pandas DataFrame
    - ``'sqlalchemy'`` if url is a sqlalchemy compatible URL
    - ``protocol`` if url is of the form `protocol://...`
    - ``'dir'`` if it is not a URL but a valid directory
    - ``'file'`` if it is not a URL but a valid file

    Else it raises an Exception
    '''
    if isinstance(url, pd.DataFrame):
        return 'dataframe'
    try:
        parsed = sqlalchemy.engine.url.make_url(url)
    except sqlalchemy.exc.ArgumentError:
        # Not an RFC-1738 URL: treat as a directory or file path
        return 'dir' if os.path.isdir(url) else 'file'
    try:
        # A resolvable driver means sqlalchemy can connect to it
        parsed.get_driver_name()
    except sqlalchemy.exc.NoSuchModuleError:
        # Unknown dialect, e.g. http:// -- report the protocol itself
        return parsed.drivername
    return 'sqlalchemy'

422 

423 

def create_engine(url, **kwargs):
    '''
    Cached version of sqlalchemy.create_engine.

    Normally, this is not required. But :py:func:`get_table` caches the engine
    *and* metadata *and* uses autoload=True. This makes sqlalchemy create a new
    database connection for every engine object, and not dispose it. So we
    re-use the engine objects within this module.
    '''
    engine = _ENGINE_CACHE.get(url)
    if engine is None:
        engine = sqlalchemy.create_engine(url, **kwargs)
        _ENGINE_CACHE[url] = engine
    return engine

436 

437 

def get_table(engine, table):
    '''
    Return the sqlalchemy table from the engine and table name.
    A dotted name like ``schema.table`` is split into schema and table.
    '''
    # One MetaData per engine, cached for the life of the process
    if engine not in _METADATA_CACHE:
        _METADATA_CACHE[engine] = sqlalchemy.MetaData()
    metadata = _METADATA_CACHE[engine]
    if '.' not in table:
        return sqlalchemy.Table(table, metadata, autoload=True, autoload_with=engine)
    schema, tbl = table.rsplit('.', 1)
    return sqlalchemy.Table(tbl, metadata, autoload=True, autoload_with=engine, schema=schema)

448 

449 

450def _pop_controls(args): 

451 '''Filter out data controls: sort, limit, offset and column (_c) from args''' 

452 return { 

453 key: args.pop(key) 

454 for key in ('_sort', '_limit', '_offset', '_c', '_by') 

455 if key in args 

456 } 

457 

458 

459def _pop_columns(data, cols, ignored): 

460 '''Remove columns not in cols''' 

461 cols = set(cols) 

462 for col in data.columns: 

463 if col not in cols: 

464 ignored.append([col, data[col].tolist()]) 

465 return data[[col for col in cols if col in data.columns]] 

466 

467 

def _sql_safe(val):
    '''Return True if val is safe for insertion in an SQL query'''
    # Strings are safe only if they contain no whitespace
    if isinstance(val, six.string_types):
        return re.search(r'\s', val) is None
    # Numbers and booleans are always safe; everything else is not
    return isinstance(val, six.integer_types) or isinstance(val, (float, bool))

475 

476 

def _path_safe(path):
    '''Returns True if path does not try to escape outside a given directory using .. or / etc'''
    # Ignore non-strings. These are generally not meant for paths
    if not isinstance(path, six.string_types):
        return True
    # Joining under a dummy root and resolving must stay inside that root
    resolved = os.path.realpath(os.path.join(_path_safe_root, path))
    return resolved.startswith(_path_safe_root)

483 

484 

# Comparison-operator suffixes recognised in ?col<op>= query keys.
# The order of operators is important. ~ is at the end. Otherwise, !~
# or >~ will also be mapped to ~ as an operator
operators = ['!', '>', '>~', '<', '<~', '!~', '~']

488 

489 

def _filter_col(col, cols):
    '''
    Parses a column name from a list of columns and returns a (col, agg, op)
    tuple.

    - ``col`` is the name of the column in cols.
    - ``agg`` is the aggregation operation (SUM, MIN, MAX, etc), else None
    - ``op`` is the operator ('', !, >, <, etc)

    If the column is invalid, then ``col`` and ``op`` are None
    '''
    known = set(cols)
    # Plain column, e.g. ?col=
    if col in known:
        return col, None, ''
    # Column with an operator suffix, e.g. ?col>~= or ?col|SUM>~=
    for op in operators:
        if not col.endswith(op):
            continue
        stem = col[:-len(op)]
        if stem in known:
            return stem, None, op
        if _agg_sep in stem:
            base, agg = stem.rsplit(_agg_sep, 1)
            if base in known:
                return base, agg, op
    # Pure aggregation without an operator, e.g. ?col|SUM=
    if _agg_sep in col:
        base, agg = col.rsplit(_agg_sep, 1)
        if base in known:
            return base, agg, ''
    # Otherwise we don't know what it is
    return None, None, None

523 

524 

525def _filter_frame_col(data, key, col, op, vals, meta): 

526 # Apply type conversion for values 

527 conv = data[col].dtype.type 

528 vals = tuple(conv(val) for val in vals if val) 

529 if op not in {'', '!'} and len(vals) == 0: 

530 meta['ignored'].append((key, vals)) 

531 elif op == '': 

532 data = data[data[col].isin(vals)] if len(vals) else data[pd.notnull(data[col])] 

533 elif op == '!': 

534 data = data[~data[col].isin(vals)] if len(vals) else data[pd.isnull(data[col])] 

535 elif op == '>': 

536 data = data[data[col] > min(vals)] 

537 elif op == '>~': 

538 data = data[data[col] >= min(vals)] 

539 elif op == '<': 

540 data = data[data[col] < max(vals)] 

541 elif op == '<~': 

542 data = data[data[col] <= max(vals)] 

543 elif op == '!~': 

544 data = data[~data[col].str.contains('|'.join(vals))] 

545 elif op == '~': 545 ↛ 547line 545 didn't jump to line 547, because the condition on line 545 was never false

546 data = data[data[col].str.contains('|'.join(vals))] 

547 meta['filters'].append((col, op, vals)) 

548 return data 

549 

550 

def _filter_db_col(query, method, key, col, op, vals, column, conv, meta):
    '''
    - Updates ``query`` with a method (WHERE/HAVING) that sets '<key> <op> <vals>'
    - ``column`` is the underlying ColumnElement
    - ``conv`` is a type conversion function that converts ``vals`` to the correct type
    - Updates ``meta`` with the fields used for filtering (or ignored)
    '''
    # In PY2, .python_type returns str. We want unicode
    # NOTE(review): pd.datetime is removed in pandas >= 2.0 -- confirm the
    # pinned pandas version before upgrading
    sql_types = {six.binary_type: six.text_type, pd.datetime: six.text_type}
    conv = sql_types.get(conv, conv)
    vals = tuple(conv(val) for val in vals if val)
    if op not in {'', '!'} and len(vals) == 0:
        # BUG FIX: an ignored filter was previously ALSO appended to
        # meta['filters']. Return early so it is only recorded as ignored.
        meta['ignored'].append((key, vals))
        return query
    if op == '':
        # Test if column is not NULL. != None is NOT the same as is not None
        query = method(column.in_(vals) if len(vals) else column != None)      # noqa
    elif op == '!':
        # Test if column is NULL. == None is NOT the same as is None
        query = method(column.notin_(vals) if len(vals) else column == None)   # noqa
    elif op == '>':
        query = method(column > min(vals))
    elif op == '>~':
        query = method(column >= min(vals))
    elif op == '<':
        query = method(column < max(vals))
    elif op == '<~':
        query = method(column <= max(vals))
    elif op == '!~':
        query = method(column.notlike('%' + '%'.join(vals) + '%'))
    elif op == '~':
        query = method(column.like('%' + '%'.join(vals) + '%'))
    meta['filters'].append((col, op, vals))
    return query

584 

585 

586def _filter_sort_columns(sort_filter, cols): 

587 sorts, ignore_sorts = [], [] 

588 for col in sort_filter: 

589 if col in cols: 

590 sorts.append((col, True)) 

591 elif col.startswith('-') and col[1:] in cols: 

592 sorts.append((col[1:], False)) 

593 else: 

594 ignore_sorts.append(col) 

595 return sorts, ignore_sorts 

596 

597 

598def _filter_select_columns(col_filter, cols, meta): 

599 ''' 

600 Checks ?_c=col&_c=-col for filter(). Takes values of ?_c= as col_filter and 

601 data column names as cols. Returns 2 lists: show_cols as columns to show. 

602 ignored_cols has column names not in the list, i.e. the ?_c= parameters that 

603 are ignored. 

604 ''' 

605 selected_cols, excluded_cols, ignored_cols = [], set(), [] 

606 for col in col_filter: 

607 if col in cols: 

608 selected_cols.append(col) 

609 elif col.startswith('-') and col[1:] in cols: 

610 excluded_cols.add(col[1:]) 

611 else: 

612 ignored_cols.append(col) 

613 if len(excluded_cols) > 0 and len(selected_cols) == 0: 

614 selected_cols = cols 

615 show_cols = [col for col in selected_cols if col not in excluded_cols] 

616 meta['excluded'] = list(excluded_cols) 

617 return show_cols, ignored_cols 

618 

619 

620def _filter_groupby_columns(by, cols, meta): 

621 ''' 

622 Checks ?_by=col&_by=col for filter(). 

623 

624 - ``by``: list of column names to group by 

625 - ``cols``: list of valid column names 

626 - ``meta``: meta['by'] and meta['ignored'] are updated 

627 

628 Returns a list of columns to group by 

629 ''' 

630 colset = set(cols) 

631 for col in by: 

632 if col in colset: 

633 meta['by'].append(col) 

634 else: 

635 meta['ignored'].append(('_by', col)) 

636 return meta['by'] 

637 

638 

# If ?by=col|avg is provided, this works in SQL but not in Pandas DataFrames.
# Convert into a DataFrame friendly function
# (keys are lower-cased aggregation names, values are GroupBy.agg() names)
_frame_functions = {
    'avg': 'mean',
    'average': 'mean',
}

645 

646 

def _filter_frame(data, meta, controls, args, source='select', id=[]):
    '''
    If ``source`` is ``'select'``, returns a DataFrame in which the DataFrame
    ``data`` is filtered using ``args``. Additional controls like _sort, etc are
    in ``controls``. Metadata is stored in ``meta``.

    If ``source`` is ``'update'``, filters using ``args`` but only for columns
    mentioned in ``id``. Resulting DataFrame is updated with remaining ``args``.
    Returns the updated rows.

    If ``source`` is ``'delete'``, filters using ``args`` but only for columns
    mentioned in ``id``. Deletes these rows. Returns the deleted rows.

    :arg data: dataframe
    :arg meta: dictionary of `filters`, `ignored`, `sort`, `offset`, `limit` params from kwargs
    :arg args: user arguments to filter the data
    :arg source: accepted values - `update`, `delete` for PUT, DELETE methods in FormHandler
    :arg id: list of id specific to data using which values can be updated
    '''
    # Keep a reference to the full frame: delete/update mutate it in place below
    original_data = data
    cols_for_update = {}
    cols_having = []
    for key, vals in args.items():
        # check if `key`` is in the `id` list -- ONLY when data is updated
        if (source in ('update', 'delete') and key in id) or (source == 'select'):
            # Parse column names, ignoring missing / unmatched columns
            col, agg, op = _filter_col(key, data.columns)
            if col is None:
                meta['ignored'].append((key, vals))
                continue
            # Process aggregated columns AFTER filtering, not before (like HAVING clause)
            # e.g. ?sales|SUM=<val> should be applied only after the column is created
            if agg is not None:
                cols_having.append((key, col + _agg_sep + agg, op, vals))
                continue
            # Apply filters
            data = _filter_frame_col(data, key, col, op, vals, meta)
        elif source == 'update':
            # Update values should only contain 1 value. 2nd onwards are ignored
            if key not in data.columns or len(vals) == 0:
                meta['ignored'].append((key, vals))
            else:
                cols_for_update[key] = vals[0]
                if len(vals) > 1:
                    meta['ignored'].append((key, vals[1:]))
        else:
            # source == 'delete' with a non-id key: cannot filter by it
            meta['ignored'].append((key, vals))
    # Row count of the filtered (matched) subset
    meta['count'] = len(data)
    if source == 'delete':
        # Remove the matched rows from the original frame; return the deleted rows
        original_data.drop(data.index, inplace=True)
        return data
    elif source == 'update':
        # Write each update value into the matched rows, cast to the column dtype
        conv = {k: v.type for k, v in data.dtypes.items()}
        for key, val in cols_for_update.items():
            original_data.loc[data.index, key] = conv[key](val)
        return data
    else:
        # Apply controls
        if '_by' in controls:
            by = _filter_groupby_columns(controls['_by'], data.columns, meta)
            # If ?_c is not specified, use 'col|sum' for all numeric columns
            # TODO: This does not support ?_c=-<col> to hide a column
            col_list = controls.get('_c', None)
            if col_list is None:
                col_list = [col + _agg_sep + 'sum' for col in data.columns      # noqa
                            if pd.api.types.is_numeric_dtype(data[col])]
            agg_cols = []
            agg_dict = AttrDict()
            for key in col_list:
                # third element is the operator suffix -- unused for grouping
                col, agg, val = _filter_col(key, data.columns)
                if agg is not None:
                    # Convert aggregation into a Pandas GroupBy agg function
                    agg = agg.lower()
                    agg = _frame_functions.get(agg, agg)
                    agg_cols.append(key)
                    if col in agg_dict:
                        agg_dict[col].append(agg)
                    else:
                        agg_dict[col] = [agg]
            if len(by) > 0:
                if not agg_cols:
                    # If no aggregation columns exist, just show groupby columns.
                    data = data.groupby(by).agg('size').reset_index()
                    data = data.iloc[:, [0]]
                else:
                    data = data.groupby(by).agg(agg_dict)
                    # Flatten the (col, agg) MultiIndex back to 'col|agg' names
                    data.columns = agg_cols
                    data = data.reset_index()
                    # Apply HAVING operators
                    for key, col, op, vals in cols_having:
                        data = _filter_frame_col(data, key, col, op, vals, meta)
            else:
                # No valid groupby columns: aggregate the whole frame to one row
                row = [data[col].agg(op) for col, ops in agg_dict.items() for op in ops]
                data = pd.DataFrame([row], columns=agg_cols)
        elif '_c' in controls:
            show_cols, hide_cols = _filter_select_columns(controls['_c'], data.columns, meta)
            data = data[show_cols]
            if len(hide_cols) > 0:
                meta['ignored'].append(('_c', hide_cols))
        if '_sort' in controls:
            meta['sort'], ignore_sorts = _filter_sort_columns(controls['_sort'], data.columns)
            if len(meta['sort']) > 0:
                data = data.sort_values(by=[c[0] for c in meta['sort']],
                                        ascending=[c[1] for c in meta['sort']])
            if len(ignore_sorts) > 0:
                meta['ignored'].append(('_sort', ignore_sorts))
        if '_offset' in controls:
            try:
                # Multiple ?_offset= values: use the smallest
                offset = min(int(v) for v in controls['_offset'])
            except ValueError:
                raise ValueError('_offset not integer: %r' % controls['_offset'])
            data = data.iloc[offset:]
            meta['offset'] = offset
        if '_limit' in controls:
            try:
                # Multiple ?_limit= values: use the smallest
                limit = min(int(v) for v in controls['_limit'])
            except ValueError:
                raise ValueError('_limit not integer: %r' % controls['_limit'])
            data = data.iloc[:limit]
            meta['limit'] = limit
        return data

768 

769 

def _filter_db(engine, table, meta, controls, args, source='select', id=[]):
    '''
    Filter, update or delete rows of a database table using URL query parameters.

    It accepts the following parameters:

    :arg sqlalchemy engine engine: constructed sqlalchemy string
    :arg database table table: table name in the mentioned database
    :arg meta: dictionary of `filters`, `ignored`, `sort`, `offset`, `limit` params from kwargs
    :arg controls: dictionary of `_sort`, `_c`, `_offset`, `_limit` params
    :arg args: dictionary of user arguments to filter the data
    :arg source: accepted values - `update`, `delete` for PUT, DELETE methods in FormHandler
    :arg id: list of keys specific to data using which values can be updated

    Returns a DataFrame for ``source='select'`` (possibly empty), and the
    affected row count for ``source='update'`` / ``source='delete'``.

    NOTE(review): ``id=[]`` is a mutable default. It is only read here (never
    mutated), so this is harmless, but ``id=()`` would be safer -- confirm
    callers before changing. The name also shadows the ``id`` builtin.
    '''
    table = get_table(engine, table)
    cols = table.columns
    colslist = cols.keys()

    # Pick the base statement by operation: DELETE, UPDATE or SELECT *
    if source == 'delete':
        query = sqlalchemy.delete(table)
    elif source == 'update':
        query = sqlalchemy.update(table)
    else:
        query = sqlalchemy.select([table])
    cols_for_update = {}        # {column: new value} for UPDATE statements
    cols_having = []            # deferred filters on aggregated columns (HAVING)
    for key, vals in args.items():
        # check if `key`` is in the `id` list -- ONLY when data is updated
        if (source in ('update', 'delete') and key in id) or (source == 'select'):
            # Parse column names, ignoring missing / unmatched columns
            col, agg, op = _filter_col(key, colslist)
            if col is None:
                meta['ignored'].append((key, vals))
                continue
            # Process aggregated columns AFTER filtering, not before (like HAVING clause)
            # e.g. ?sales|SUM=<val> should be applied only after the column is created
            if agg is not None:
                cols_having.append((key, col + _agg_sep + agg, op, vals))
                continue
            # Apply filters as WHERE clauses on the query
            query = _filter_db_col(query, query.where, key, col, op, vals,
                                   cols[col], cols[col].type.python_type, meta)
        elif source == 'update':
            # Update values should only contain 1 value. 2nd onwards are ignored
            if key not in cols or len(vals) == 0:
                meta['ignored'].append((key, vals))
            else:
                cols_for_update[key] = vals[0]
                if len(vals) > 1:
                    meta['ignored'].append((key, vals[1:]))
        else:
            # DELETE with a key outside `id`: record and ignore it
            meta['ignored'].append((key, vals))
    if source == 'delete':
        # Execute the DELETE and report how many rows were removed
        res = engine.execute(query)
        return res.rowcount
    elif source == 'update':
        # Execute the UPDATE and report how many rows were changed
        query = query.values(cols_for_update)
        res = engine.execute(query)
        return res.rowcount
    else:
        # Apply controls
        if '_by' in controls:
            by = _filter_groupby_columns(controls['_by'], colslist, meta)
            query = query.group_by(*by)
            # If ?_c is not specified, use 'col|sum' for all numeric columns
            # TODO: This does not support ?_c=-<col> to hide a column
            col_list = controls.get('_c', None)
            if col_list is None:
                col_list = [col + _agg_sep + 'sum' for col, column in cols.items()  # noqa
                            if column.type.python_type.__name__ in _numeric_types]
            agg_cols = AttrDict([(col, cols[col]) for col in by])   # {label: ColumnElement}
            typ = {}            # {label: python type}
            for key in col_list:
                col, agg, val = _filter_col(key, colslist)
                if agg is not None:
                    # Convert aggregation into SQLAlchemy query
                    agg = agg.lower()
                    # Result type defaults to the column's type unless the
                    # aggregate changes it (see _agg_type above)
                    typ[key] = _agg_type.get(agg, cols[col].type.python_type)
                    agg_func = getattr(sqlalchemy.sql.expression.func, agg)
                    agg_cols[key] = agg_func(cols[col]).label(key)
            if not agg_cols:
                return pd.DataFrame()
            query = query.with_only_columns(agg_cols.values())
            # Apply HAVING operators (filters on aggregated columns)
            for key, col, op, vals in cols_having:
                query = _filter_db_col(query, query.having, key, col, op, vals,
                                       agg_cols[col], typ[col], meta)
        elif '_c' in controls:
            show_cols, hide_cols = _filter_select_columns(controls['_c'], colslist, meta)
            query = query.with_only_columns([cols[col] for col in show_cols])
            if len(hide_cols) > 0:
                meta['ignored'].append(('_c', hide_cols))
            if len(show_cols) == 0:
                return pd.DataFrame()
        if '_sort' in controls:
            # Sortable columns include both table columns and computed (agg) labels
            meta['sort'], ignore_sorts = _filter_sort_columns(
                controls['_sort'], colslist + query.columns.keys())
            for col, asc in meta['sort']:
                orderby = sqlalchemy.asc if asc else sqlalchemy.desc
                query = query.order_by(orderby(col))
            if len(ignore_sorts) > 0:
                meta['ignored'].append(('_sort', ignore_sorts))
        if '_offset' in controls:
            try:
                # Multiple ?_offset= values: use the smallest
                offset = min(int(v) for v in controls['_offset'])
            except ValueError:
                raise ValueError('_offset not integer: %r' % controls['_offset'])
            query = query.offset(offset)
            meta['offset'] = offset
        if '_limit' in controls:
            try:
                # Multiple ?_limit= values: use the smallest
                limit = min(int(v) for v in controls['_limit'])
            except ValueError:
                raise ValueError('_limit not integer: %r' % controls['_limit'])
            query = query.limit(limit)
            meta['limit'] = limit
        return pd.read_sql(query, engine)

886 

887 

888_VEGA_SCRIPT = os.path.join(_FOLDER, 'download.vega.js') 

889 

890 

def download(data, format='json', template=None, args={}, **kwargs):
    '''
    Download a DataFrame or dict of DataFrames in various formats. This is used
    by :py:class:`gramex.handlers.FormHandler`. You are **strongly** advised to
    try it before creating your own FunctionHandler.

    Usage as a FunctionHandler::

        def download_as_csv(handler):
            handler.set_header('Content-Type', 'text/csv')
            handler.set_header('Content-Disposition', 'attachment;filename=data.csv')
            return gramex.data.download(dataframe, format='csv')

    It takes the following arguments:

    :arg dataset data: A DataFrame or a dict of DataFrames
    :arg str format: Output format. Can be ``csv|json|html|xlsx|template``
    :arg file template: Path to template file for ``template`` format
    :arg dict args: dictionary of user arguments to subsitute spec
    :arg dict kwargs: Additional parameters that are passed to the relevant renderer
    :return: bytes with the download file contents

    When ``data`` is a DataFrame, this is what different ``format=`` parameters
    return:

    - ``csv`` returns a UTF-8-BOM encoded CSV file of the dataframe
    - ``xlsx`` returns an Excel file with 1 sheet named ``data``. kwargs are
      passed to ``.to_excel(index=False)``
    - ``html`` returns a HTML file with a single table. kwargs are passed to
      ``.to_html(index=False)``
    - ``json`` returns a JSON file. kwargs are passed to
      ``.to_json(orient='records', force_ascii=True)``.
    - ``template`` returns a Tornado template rendered file. The template
      receives ``data`` as ``data`` and any additional kwargs.
    - ``pptx`` returns a PPTX generated by pptgen
    - ``seaborn`` or ``sns`` returns a Seaborn generated chart
    - ``vega`` returns JavaScript that renders a Vega chart

    When ``data`` is a dict of DataFrames, the following additionally happens:

    - ``format='csv'`` renders all DataFrames one below the other, adding the
      key as heading
    - ``format='xlsx'`` renders each DataFrame on a sheet whose name is the key
    - ``format='html'`` renders tables below one another with the key as heading
    - ``format='json'`` renders as a dict of DataFrame JSONs
    - ``format='template'`` sends ``data`` and all ``kwargs`` as passed to the
      template
    - ``format='pptx'`` passes ``data`` as a dict of datasets to pptgen
    - ``format='vega'`` passes ``data`` as a dict of datasets to Vega

    You need to set the MIME types on the handler yourself. Recommended MIME
    types are in gramex.yaml under handler.FormHandler.
    '''
    # Normalize data into a dict of DataFrames. `multiple` remembers whether the
    # caller passed a dict (affects headings / JSON shape below).
    if isinstance(data, dict):
        for key, val in data.items():
            if not isinstance(val, pd.DataFrame):
                raise ValueError('download({"%s": %r}) invalid type' % (key, type(val)))
        if not len(data):
            raise ValueError('download() data requires at least 1 DataFrame')
        multiple = True
    elif not isinstance(data, pd.DataFrame):
        raise ValueError('download(%r) invalid type' % type(data))
    else:
        data = {'data': data}
        multiple = False

    def kw(**conf):
        # Merge defaults into kwargs in-place (caller kwargs win), and return it
        return merge(kwargs, conf, mode='setdefault')

    if format == 'csv':
        # csv.writer requires BytesIO in PY2 and StringIO in PY3.
        # I can't see an elegant way out of this other than writing code for each.
        if six.PY2:
            out = io.BytesIO()
            kw(index=False, encoding='utf-8')
            for index, (key, val) in enumerate(data.items()):
                if index > 0:
                    out.write(b'\n')
                if multiple:
                    out.write(key.encode('utf-8') + b'\n')
                val.to_csv(out, **kwargs)
            result = out.getvalue()
            # utf-8-sig encoding returns the result with a UTF-8 BOM. Easier to open in Excel
            return ''.encode('utf-8-sig') + result if result.strip() else result
        else:
            out = io.StringIO()
            kw(index=False)
            for index, (key, val) in enumerate(data.items()):
                if index > 0:
                    out.write('\n')
                if multiple:
                    out.write(key + '\n')
                val.to_csv(out, **kwargs)
            result = out.getvalue()
            # utf-8-sig encoding returns the result with a UTF-8 BOM. Easier to open in Excel
            return result.encode('utf-8-sig') if result.strip() else result.encode('utf-8')
    elif format == 'template':
        return gramex.cache.open(template, 'template').generate(
            data=data if multiple else data['data'], **kwargs)
    elif format == 'html':
        out = io.StringIO()
        kw(index=False)
        for key, val in data.items():
            if multiple:
                out.write('<h1>%s</h1>' % key)
            val.to_html(out, **kwargs)
        return out.getvalue().encode('utf-8')
    elif format in {'xlsx', 'xls'}:
        out = io.BytesIO()
        kw(index=False)
        # TODO: Create and use a FrameWriter for formatting
        with pd.ExcelWriter(out, engine='xlsxwriter') as writer:
            for key, val in data.items():
                val.to_excel(writer, sheet_name=key, **kwargs)
        return out.getvalue()
    elif format in {'pptx', 'ppt'}:
        from gramex.pptgen import pptgen    # noqa
        out = io.BytesIO()
        pptgen(target=out, data=data, is_formhandler=True, **kwargs)
        return out.getvalue()
    elif format in {'seaborn', 'sns'}:
        # NOTE: rebinds `kw` from the helper function above to an AttrDict of
        # chart options popped out of kwargs (the rest go to the seaborn call)
        kw = AttrDict()
        defaults = {'chart': 'barplot', 'ext': 'png', 'data': 'data', 'dpi': 96,
                    'width': 640, 'height': 480}
        for key, default in defaults.items():
            kw[key] = kwargs.pop(key, default)
        import matplotlib
        matplotlib.use('Agg')       # Before importing seaborn, set a headless backend
        import seaborn as sns
        plot = getattr(sns, kw.chart)(data=data.get(kw.data), **kwargs)
        out = io.BytesIO()
        fig = plot.figure if hasattr(plot, 'figure') else plot.fig
        for k in ['dpi', 'width', 'height']:
            kw[k] = float(kw[k])
        fig.set_size_inches(kw.width / kw.dpi, kw.height / kw.dpi)
        fig.savefig(out, format=kw.ext, dpi=kw.dpi)
        fig.clear()
        return out.getvalue()
    elif format in {'vega', 'vega-lite', 'vegam'}:
        kwargs = kw(orient='records', force_ascii=True)
        spec = kwargs.pop('spec', {})
        kwargs.pop('handler', None)
        out = io.BytesIO()
        # conf = {..., spec: {..., data: __DATA__}}
        if isinstance(spec.get('data'), (dict, list)) or 'fromjson' in spec:
            # support only one dataset
            values = list(data.values())
            out.write(values[0].to_json(**kwargs).encode('utf-8'))
            out = out.getvalue()
        else:
            spec['data'] = '__DATA__'
            # Emit named Vega datasets: {"name": <key>, "values": <rows>}, ...
            for index, (key, val) in enumerate(data.items()):
                out.write(b',{"name":' if index > 0 else b'{"name":')
                out.write(json_encode(key).encode('utf-8'))
                out.write(b',"values":')
                out.write(val.to_json(**kwargs).encode('utf-8'))
                out.write(b'}')
            out = out.getvalue()
            if format == 'vega':
                out = b'[' + out + b']'
        kwargs['spec'], _ = _replace('', args, spec)
        conf = json.dumps(kwargs, ensure_ascii=True, separators=(',', ':'), indent=None)
        # Splice the raw dataset bytes into the JSON config at the placeholder
        conf = conf.encode('utf-8').replace(b'"__DATA__"', out)
        script = gramex.cache.open(_VEGA_SCRIPT, 'bin')
        return script.replace(b'/*{conf}*/', conf)
    else:
        # Default: JSON. A dict of DataFrames becomes {key: records, ...}
        out = io.BytesIO()
        kwargs = kw(orient='records', force_ascii=True)
        if multiple:
            out.write(b'{')
            for index, (key, val) in enumerate(data.items()):
                if index > 0:
                    out.write(b',')
                out.write(json_encode(key).encode('utf-8'))
                out.write(b':')
                out.write(val.to_json(**kwargs).encode('utf-8'))
            out.write(b'}')
        else:
            out.write(data['data'].to_json(**kwargs).encode('utf-8'))
        return out.getvalue()

1071 

1072 

def dirstat(url, timeout=10, **kwargs):
    '''
    Return a DataFrame with the list of all files & directories under the url.

    It accepts the following parameters:

    :arg str url: path to a directory, or a URL like ``dir:///c:/path/``,
        ``dir:////root/dir/``. Raises ``OSError`` if url points to a missing
        location or is not a directory.
    :arg int timeout: max seconds to wait. ``None`` to wait forever. (default: 10)
    :return: a DataFrame with columns:
        - ``type``: extension with a ``.`` prefix -- or ``dir``
        - ``dir``: directory path to the file relative to the URL
        - ``name``: file name (including extension)
        - ``path``: full path to file or dir. This equals url / dir / name
        - ``size``: file size
        - ``mtime``: last modified time in seconds since epoch
        - ``level``: path depth (i.e. the number of paths in dir)
    '''
    # Accept both plain paths and dir:// style URLs: for a URL, the path is in
    # the `database` part of the parsed SQLAlchemy URL
    try:
        url = sqlalchemy.engine.url.make_url(url)
        target = url.database
    except sqlalchemy.exc.ArgumentError:
        target = url
    if not os.path.isdir(target):
        raise OSError('dirstat: %s is not a directory' % target)
    target = os.path.normpath(target)
    result = []
    start_time = time.time()
    for dirpath, dirnames, filenames in os.walk(target):
        # Best-effort scan: stop (returning partial results) once timeout expires
        if timeout and time.time() - start_time > timeout:
            app_log.debug('dirstat: %s timeout (%.1fs)', url, timeout)
            break
        # dirname is the same for every entry under this dirpath: compute once
        dirname = dirpath.replace(target, '').replace(os.sep, '/') + '/'
        level = dirname.count('/')
        # Directories first (type='dir'), then files (type=extension), matching
        # os.walk's per-directory ordering
        entries = [(name, 'dir') for name in dirnames]
        entries += [(name, os.path.splitext(name)[-1]) for name in filenames]
        for name, entry_type in entries:
            path = os.path.join(dirpath, name)
            stat = os.stat(path)
            result.append({
                'path': path, 'dir': dirname, 'name': name, 'type': entry_type,
                'size': stat.st_size, 'mtime': stat.st_mtime, 'level': level,
            })
    return pd.DataFrame(result)

1123 

1124 

def filtercols(url, args={}, meta={}, engine=None, table=None, ext=None,
               query=None, queryfile=None, transform=None, transform_kwargs={}, **kwargs):
    '''
    Filter data and extract unique values of each column using URL query parameters.
    Typical usage::

        filtered = gramex.data.filtercols(dataframe, args=handler.args)
        filtered = gramex.data.filtercols('file.csv', args=handler.args)
        filtered = gramex.data.filtercols('mysql://server/db', table='table', args=handler.args)

    It accepts the following parameters:

    :arg source url: Pandas DataFrame, sqlalchemy URL, directory or file name,
        ``.format``-ed using ``args``.
    :arg dict args: URL query parameters as a dict of lists. Pass handler.args
        or parse_qs results
    :arg dict meta: this dict is updated with metadata during the course of filtering
    :arg str engine: over-rides the auto-detected engine. Can be 'dataframe',
        'file', 'http', 'https', 'sqlalchemy', 'dir'
    :arg str table: table name (if url is an SQLAlchemy URL), ``.format``-ed
        using ``args``.
    :arg str ext: file extension (if url is a file). Defaults to url extension
    :arg str query: optional SQL query to execute (if url is a database),
        ``.format``-ed using ``args`` and supports SQLAlchemy SQL parameters.
        Loads entire result in memory before filtering.
    :arg str queryfile: optional SQL query file to execute (if url is a database).
        Same as specifying the ``query:`` in a file. Overrides ``query:``
    :arg function transform: optional in-memory transform of source data. Takes
        the result of gramex.cache.open or gramex.cache.query. Must return a
        DataFrame. Applied to both file and SQLAlchemy urls.
    :arg dict transform_kwargs: optional keyword arguments to be passed to the
        transform function -- apart from data
    :arg dict kwargs: Additional parameters are passed to
        :py:func:`gramex.cache.open` or ``sqlalchemy.create_engine``
    :return: dict mapping each ``?_c=`` column name to a DataFrame of its
        unique values

    For every column named in ``?_c=``, the result holds that column's unique
    values after applying every filter in ``args`` EXCEPT filters on the column
    itself. For example, ``?_c=x&_c=y&x=val`` returns unique values of ``x``
    ignoring ``x == val``, and unique values of ``y`` where ``x == val``.
    ``?_sort=`` is passed through; ``?_limit=`` caps the number of unique
    values per column (default: 100). Arguments are converted to the type of
    the column before comparing; a failed conversion raises a ValueError.
    Avoid column names beginning with _.
    '''
    # Auto-detect the engine from the url unless the caller fixed it
    if engine is None:
        engine = get_engine(url)
    unique_values = {}
    # ?_limit= caps the unique values returned per column; smallest value wins
    max_rows = args.get('_limit', [100])
    try:
        max_rows = min(int(v) for v in max_rows)
    except ValueError:
        raise ValueError('_limit not integer: %r' % max_rows)
    for target_col in args.get('_c', []):
        # Carry over _sort plus every filter EXCEPT filters on target_col itself
        sub_args = {
            key: value for key, value in args.items()
            if key == '_sort' or (not key.startswith('_') and key != target_col)
        }
        # Group by target_col (yielding its unique values), hide other columns,
        # and cap the row count
        sub_args['_by'] = [target_col]
        sub_args['_c'] = []
        sub_args['_limit'] = [max_rows]
        unique_values[target_col] = gramex.data.filter(
            url, table=table, args=sub_args, **kwargs)
    return unique_values