Coverage for gramex\apps\logviewer\logviewer.py : 77%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import re
2import sys
3import os.path
4import sqlite3
5from glob import glob
6from lxml.etree import Element
7from lxml.html import fromstring, tostring
8import numpy as np
9import pandas as pd
10import gramex.data
11import gramex.cache
12from gramex import conf
13from gramex.config import app_log
14from gramex.transforms import build_transform
# Python 2/3 compatibility: Python 3 has no builtin `unicode`,
# so alias it to `str` (used below for text conversion).
if sys.version_info.major == 3:
    unicode = str

# Layout of the aggregation database: one table per time level
# ('agg' + level, e.g. aggM / aggW / aggD).
DB_CONFIG = {
    'table': 'agg{}',
    # aggregation granularities: Month, Week, Day
    'levels': ['M', 'W', 'D'],
    # group-by keys; the dict entry is a pd.Grouper spec whose '?level'
    # freq placeholder is overwritten per level at summarize time
    'dimensions': [{'key': 'time', 'freq': '?level'},
                   'user.id', 'ip', 'status', 'uri'],
    # aggregations per group, stored as <metric>_<func> columns
    'metrics': {
        'duration': ['count', 'sum'],
        'new_session': ['sum'],
        'session_time': ['sum']
    }
}
# Flat list of every column in an agg table: metric columns
# (e.g. 'duration_count') followed by the dimension columns.
DB_CONFIG['table_columns'] = [
    '{}_{}'.format(k, x)
    for k, v in DB_CONFIG['metrics'].items()
    for x in v] + [
    x['key'] if isinstance(x, dict) else x
    for x in DB_CONFIG['dimensions']]

# Directory of this module; config.yaml is the default UI configuration.
FOLDER = os.path.dirname(os.path.abspath(__file__))
CONFIG_FILE = os.path.join(FOLDER, 'config.yaml')
def pdagg(df, groups, aggfuncs):
    '''
    groups = [{'key': 'time', 'freq': 'D'}, 'user.id', 'status', 'uri']
    aggfuncs = {'duration': ['count', 'mean', namedfunc], 'status': ['count']}
    '''
    # dict entries describe a pd.Grouper (e.g. time resampling);
    # plain strings are column names
    keys = []
    for group in groups:
        keys.append(pd.Grouper(**group) if isinstance(group, dict) else group)
    summary = df.groupby(keys).agg(aggfuncs)
    # flatten (metric, func) column pairs into metric_func names
    if isinstance(summary.columns, pd.MultiIndex):
        summary.columns = summary.columns.map('_'.join)
    return summary.reset_index()
def table_exists(table, conn):
    '''Return True if `table` exists in the SQLite database behind `conn`.

    Uses a parameterized query instead of string interpolation so a
    table name containing quotes cannot break (or inject into) the SQL.
    '''
    query = ("SELECT name FROM sqlite_master "
             "WHERE type='table' AND name=?")
    return not pd.read_sql(query, conn, params=(table,)).empty
def add_session(df, duration=30, cutoff_buffer=0):
    '''add new_session based on `duration` threshold
    add cutoff_buffer in minutes for first and last session requests
    '''
    # seconds elapsed since the same user's previous request (NaN on first)
    gap = df.groupby('user.id')['time'].diff().dt.total_seconds()
    # a session starts on a user's first request or after a long gap
    starts = gap.isnull() | (gap >= duration * 60)
    df['new_session'] = starts.astype(int)
    # session starts get the fixed buffer; others get the actual gap
    df['session_time'] = np.where(starts, cutoff_buffer * 60, gap)
    return df
def prepare_logs(df, session_threshold=15, cutoff_buffer=0):
    '''Clean raw request logs and add session metrics.

    - converts `time` (epoch milliseconds) to datetime; drops pre-2000
      and null/NaT rows
    - coerces non-numeric `duration`/`status` to numeric, dropping rows
      that fail to convert
    - sorts by time (logging via threads may not maintain order)
    - adds session metrics (new_session, session_time) via add_session

    Parameters:
        df: raw log DataFrame with time, duration, status, user.id columns
        session_threshold: minutes of inactivity that start a new session
        cutoff_buffer: minutes credited to first/last session requests
    '''
    # Work on copies: the original mutated the caller's frame and then
    # assigned into a slice of it, raising SettingWithCopyWarning.
    df = df.copy()
    df['time'] = pd.to_datetime(df['time'], unit='ms', errors='coerce')
    # Ignore pre-2000 year and null/NaT rows
    df = df[df['time'] > '2000-01-01'].copy()
    for col in ['duration', 'status']:
        if not np.issubdtype(df[col].dtype, np.number):
            df[col] = pd.to_numeric(df[col], errors='coerce')
        df = df[df[col].notnull()]
    # logging via threads may not maintain order
    df = df.sort_values(by='time')
    # add new_session
    df = add_session(df, duration=session_threshold, cutoff_buffer=cutoff_buffer)
    return df
def summarize(transforms=None, post_transforms=None, run=True,
              session_threshold=15, cutoff_buffer=0):
    '''Incrementally aggregate gramex request logs into logviewer.db.

    Reads the request-log CSVs, cleans them with prepare_logs, applies
    `transforms` on the raw rows, then for each level in
    DB_CONFIG['levels'] (M/W/D) aggregates with pdagg, applies
    `post_transforms` and appends to the agg<level> SQLite table.

    Parameters:
        transforms: list of transform specs applied to the raw log rows
        post_transforms: list of transform specs applied to each summary
        run: True for an incremental run; 'drop' drops the agg tables and
             stops; 'reload' drops them and rebuilds from scratch
        session_threshold: minutes of inactivity that start a new session
        cutoff_buffer: minutes credited to first/last session requests
    '''
    # Avoid mutable default arguments (the original used []= defaults)
    transforms = [] if transforms is None else transforms
    post_transforms = [] if post_transforms is None else post_transforms
    app_log.info('logviewer: Summarize started')
    levels = DB_CONFIG['levels']
    table = DB_CONFIG['table'].format
    # dimensions and metrics to summarize
    groups = DB_CONFIG['dimensions']
    aggfuncs = DB_CONFIG['metrics']
    log_file = conf.log.handlers.requests.filename
    # Handle for multiple instances requests.csv$LISTENPORT
    log_file = '{0}{1}'.format(*log_file.partition('.csv'))
    folder = os.path.dirname(log_file)
    conn = sqlite3.connect(os.path.join(folder, 'logviewer.db'))
    # drop agg tables from database
    if run in ['drop', 'reload']:
        droptable = 'DROP TABLE IF EXISTS {}'.format
        for freq in levels:
            app_log.info('logviewer: Dropping {} table'.format(table(freq)))
            conn.execute(droptable(table(freq)))
        conn.commit()
        conn.execute('VACUUM')
        if run == 'drop':
            conn.close()
            return
    # all log files sorted by modified time
    log_files = sorted(glob(log_file + '*'), key=os.path.getmtime)
    max_date = None

    def filesince(filename, date):
        # rotated backups end in a YYYY-MM-DD suffix; keep files on/after
        # `date` plus the live (unsuffixed) file
        match = re.search(r'(\d{4}-\d{2}-\d{2})$', filename)
        backupdate = match.group() if match else ''
        return backupdate >= date or backupdate == ''

    # get this month log files if db is already created
    if table_exists(table(levels[-1]), conn):
        max_date = pd.read_sql(
            'SELECT MAX(time) FROM {}'.format(
                table(levels[-1])), conn).iloc[0, 0]
        app_log.info('logviewer: last processed till %s', max_date)
        this_month = max_date[:8] + '01'
        log_files = [f for f in log_files if filesince(f, this_month)]
        max_date = pd.to_datetime(max_date)

    if not log_files:
        app_log.info('logviewer: no log files to process')
        return
    # Create dataframe from log files
    columns = conf.log.handlers.requests['keys']
    # TODO: avoid concat?
    app_log.info('logviewer: files to process %s', log_files)
    data = pd.concat([
        pd.read_csv(f, names=columns, encoding='utf-8').fillna('-')
        for f in log_files
    ], ignore_index=True)
    app_log.info(
        'logviewer: prepare_logs {} rows with {} mint session_threshold'.format(
            len(data.index), session_threshold))
    data = prepare_logs(df=data,
                        session_threshold=session_threshold,
                        cutoff_buffer=cutoff_buffer)
    app_log.info('logviewer: processed and returned {} rows'.format(len(data.index)))
    # apply transforms on raw data
    app_log.info('logviewer: applying transforms')
    for spec in transforms:
        apply_transform(data, spec)  # applies on copy
    delete = 'DELETE FROM {} WHERE time >= "{}"'.format
    # levels should go from M > W > D
    for freq in levels:
        # filter dataframe for max_date.level
        if max_date:
            date_from = max_date
            if freq == 'W':
                date_from -= pd.offsets.Day(max_date.weekday())
            if freq == 'M':
                date_from -= pd.offsets.MonthBegin(1)
            data = data[data.time.ge(date_from)]
            # delete old records
            conn.execute(delete(table(freq), date_from))
            conn.commit()
        # NOTE(review): this mutates the shared DB_CONFIG['dimensions'][0]
        # in place — confirm no concurrent summarize runs rely on it
        groups[0]['freq'] = freq
        # get summary view
        app_log.info('logviewer: pdagg for {}'.format(table(freq)))
        dff = pdagg(data, groups, aggfuncs)
        # apply post_transforms here
        app_log.info('logviewer: applying post_transforms')
        for spec in post_transforms:
            apply_transform(dff, spec)
        # insert new records
        try:
            dff.to_sql(table(freq), conn, if_exists='append', index=False)
        # dff columns should match with table columns
        # if not, call summarize run='reload' to
        # drop all the tables and rerun the job
        except sqlite3.OperationalError:
            app_log.info('logviewer: OperationalError: run: reload')
            # Rebuild from scratch with ALL original arguments. The
            # original dropped post_transforms/session params here, so
            # the reload produced different tables than the normal run.
            summarize(transforms=transforms, post_transforms=post_transforms,
                      run='reload', session_threshold=session_threshold,
                      cutoff_buffer=cutoff_buffer)
            return
    conn.close()
    app_log.info('logviewer: Summarize completed')
    return
def prepare_where(query, args, columns):
    '''prepare where clause'''
    clauses = []
    for key, vals in args.items():
        col, agg, op = gramex.data._filter_col(key, columns)
        # skip filters on unknown columns
        if col not in columns:
            continue
        if op == '':
            clauses.append('"{}" IN ("{}")'.format(col, '", "'.join(vals)))
        elif op == '!':
            clauses.append('"{}" NOT IN ("{}")'.format(col, '", "'.join(vals)))
        elif op in ('>', '>~'):
            sym = '>' if op == '>' else '>='
            clauses.append('"{}" {} "{}"'.format(col, sym, min(vals)))
        elif op in ('<', '<~'):
            sym = '<' if op == '<' else '<='
            clauses.append('"{}" {} "{}"'.format(col, sym, max(vals)))
        elif op == '~':
            like = ' OR '.join('"{}" LIKE "%{}%"'.format(col, v) for v in vals)
            clauses.append('({})'.format(like))
        elif op == '!~':
            like = ' OR '.join('"{}" NOT LIKE "%{}%"'.format(col, v) for v in vals)
            clauses.append('({})'.format(like))
    joined = ' AND '.join(clauses)
    if not joined:
        return joined
    # chain with AND when the query template already has a WHERE
    prefix = 'WHERE ' if ' WHERE ' not in query else 'AND '
    return prefix + joined
def query(handler, args):
    '''queries for logviewer'''
    # pick the SQL template named by the URL's {query} segment
    queries = handler.kwargs.kwargs.queries
    template = queries.get(handler.path_kwargs.get('query'))
    table = handler.path_kwargs.get('table')
    # fill the {where} slot from the request arguments
    where = prepare_where(template, args, DB_CONFIG['table_columns'])
    return template.format(table=table, where=where)
def apply_transform(data, spec):
    '''Apply one transform `spec` to `data` in place and return it.

    If spec['type'] == 'function', spec['expr'] is compiled with
    build_transform and called with the dataframe. Otherwise spec['expr']
    is a dict with 'op' (a key of pandas_transforms), 'col', optional
    'value' and optional 'kwargs'; the result is written to column
    spec['as'].
    '''
    pandas_transforms = {
        'REPLACE': pd.Series.replace,
        'MAP': pd.Series.map,
        'IN': pd.Series.isin,
        'NOTIN': lambda s, v: ~s.isin(v),
        'CONTAINS': {
            'function': lambda s, v, **ops: s.str.contains(v, **ops),
            'defaults': {'case': False}
        },
        'NOTCONTAINS': {
            'function': lambda s, v, **ops: ~s.str.contains(v, **ops),
            'defaults': {'case': False}
        },
        'LEN': lambda s, _: s.str.len(),
        'LOWER': lambda s, _: s.str.lower(),
        'UPPER': lambda s, _: s.str.upper(),
        'PROPER': lambda s, _: s.str.capitalize(),
        'STARTSWITH': lambda s, v: s.str.startswith(v),
        'ENDSWITH': lambda s, v: s.str.endswith(v)
    }
    # TODO: STRREPLACE
    if spec['type'] == 'function':
        fn = build_transform(
            {'function': spec['expr']}, vars={'data': None},
            filename='lv: %s' % spec.get('name'))
        fn(data)  # applies on copy
        return data
    expr = spec['expr']
    func = pandas_transforms[expr['op']]
    # Copy the kwargs: the original wrote defaults straight into
    # expr['kwargs'], silently mutating the caller's spec across calls.
    kwargs = dict(expr.get('kwargs', {}))
    if isinstance(func, dict):
        # use defaults' kwargs if not present in the spec
        for key, val in func.get('defaults', {}).items():
            kwargs.setdefault(key, val)
        func = func['function']
    data[spec['as']] = func(data[expr['col']], expr.get('value'), **kwargs)
    return data
def get_config(handler=None):
    '''Return the logviewer UI configuration as a dict.

    Uses the handler's `path_ui` kwarg when given, falling back to the
    bundled CONFIG_FILE. The original crashed with AttributeError when
    called with the default handler=None; now None also falls back to
    CONFIG_FILE.
    '''
    if handler is None:
        file_path = CONFIG_FILE
    else:
        file_path = handler.kwargs.get('path_ui', CONFIG_FILE)
    return gramex.cache.open(file_path, 'config')
def load_component(page, **kwargs):
    '''return generated template'''
    # template path is resolved relative to this module (rel=True)
    template = gramex.cache.open(page, 'template', rel=True)
    return template.generate(**kwargs)
def load_layout(config):
    '''return generated layout'''
    # Build the tree under a temporary <root> element, serialize it, then
    # slice off the b'<root>' (6 bytes) and b'</root>' (7 bytes) wrapper
    # to leave only the inner markup.
    return tostring(eltree(config, root=Element('root')))[6:-7]
def eltree(data, root=None):
    '''Convert dict to etree.Element(s)

    Keys starting with '@' become attributes of `root`, a '$' key sets
    its text, and a '_$' key renders a component template and appends the
    parsed markup. Every other key becomes a child element (list values
    create repeated children), recursing into nested dicts.
    '''
    attr_prefix = '@'
    text_key = '$'
    tpl_key = '_$'
    # with no root, collect the generated elements in a plain list
    result = [] if root is None else root
    if isinstance(data, dict):
        for key, value in data.items():
            if root is not None:
                # attribute prefixes
                # NOTE(review): lstrip removes every leading '@', not just
                # one — fine while keys use a single '@' prefix
                if key.startswith(attr_prefix):
                    key = key.lstrip(attr_prefix)
                    result.set(key, unicode(value))
                    continue
                # text content
                if key == text_key:
                    result.text = unicode(value)
                    continue
                # template hooks: render '<tpl>.html' with tpl['values']
                # (or the tpl dict itself) and append the parsed node
                if key == tpl_key:
                    for tpl in value if isinstance(value, list) else [value]:
                        template = '{}.html'.format(tpl['tpl'])
                        raw_node = load_component(template, values=tpl.get('values', tpl))
                        result.append(fromstring(raw_node))
                    continue
            # add other keys as children
            values = value if isinstance(value, list) else [value]
            for value in values:
                elem = Element(key)
                result.append(elem)
                # scalars to text: wrap so the recursion sets elem.text
                if not isinstance(value, (dict, list)):
                    value = {text_key: value}
                eltree(value, root=elem)
    else:
        # non-dict data becomes a single element named after its value
        result.append(Element(unicode(data)))
    return result