Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1import re 

2import sys 

3import os.path 

4import sqlite3 

5from glob import glob 

6from lxml.etree import Element 

7from lxml.html import fromstring, tostring 

8import numpy as np 

9import pandas as pd 

10import gramex.data 

11import gramex.cache 

12from gramex import conf 

13from gramex.config import app_log 

14from gramex.transforms import build_transform 

15 

# Py2/Py3 compatibility: on Python 3 the builtin `unicode` does not exist,
# so alias it to `str`; the rest of the module calls `unicode(...)` freely.
if sys.version_info.major == 3:
    unicode = str

18 

# Aggregation schema for the logviewer sqlite database.
DB_CONFIG = {
    # Table-name template: filled with a level code, e.g. aggM / aggW / aggD.
    'table': 'agg{}',
    # Aggregation frequencies: Month, Week, Day.
    'levels': ['M', 'W', 'D'],
    # Group-by keys. The dict entry becomes a pd.Grouper; its '?level'
    # freq placeholder is overwritten with the current level at runtime.
    'dimensions': [{'key': 'time', 'freq': '?level'},
                   'user.id', 'ip', 'status', 'uri'],
    # Metrics to aggregate: column -> list of aggregation functions.
    'metrics': {
        'duration': ['count', 'sum'],
        'new_session': ['sum'],
        'session_time': ['sum']
    }
}
# Flattened list of the columns every agg table holds: one
# '<metric>_<aggfunc>' column per metric/aggregation pair, plus one column
# per dimension (using the 'key' for dict-style dimensions).
DB_CONFIG['table_columns'] = [
    '{}_{}'.format(k, x)
    for k, v in DB_CONFIG['metrics'].items()
    for x in v] + [
    x['key'] if isinstance(x, dict) else x
    for x in DB_CONFIG['dimensions']]

36 

37 

# Directory containing this module; used to locate the bundled default
# UI configuration file.
FOLDER = os.path.dirname(os.path.abspath(__file__))
CONFIG_FILE = os.path.join(FOLDER, 'config.yaml')

40 

41 

def pdagg(df, groups, aggfuncs):
    '''Group *df* by *groups* and aggregate with *aggfuncs*.

    groups = [{'key': 'time', 'freq': 'D'}, 'user.id', 'status', 'uri']
    aggfuncs = {'duration': ['count', 'mean', namedfunc], 'status': ['count']}

    Dict entries in *groups* become pd.Grouper objects (e.g. time-based
    resampling); plain strings are used as column names. Returns a flat
    DataFrame with '<metric>_<agg>' column names.
    '''
    keys = []
    for g in groups:
        keys.append(pd.Grouper(**g) if isinstance(g, dict) else g)
    summary = df.groupby(keys).agg(aggfuncs)
    # agg() with lists of functions yields ('metric', 'agg') column pairs;
    # flatten them to 'metric_agg' so they map onto the sqlite table columns.
    if isinstance(summary.columns, pd.MultiIndex):
        summary.columns = summary.columns.map('_'.join)
    return summary.reset_index()

53 

54 

def table_exists(table, conn):
    '''Return True if *table* exists in the sqlite database behind *conn*.

    Uses a parameterised query (instead of string formatting) so a table
    name containing quotes cannot break — or inject into — the SQL.
    '''
    query = ("SELECT name FROM sqlite_master "
             "WHERE type='table' AND name=?")
    return not pd.read_sql(query, conn, params=(table,)).empty

60 

61 

def add_session(df, duration=30, cutoff_buffer=0):
    '''Flag session starts and measure time spent inside sessions.

    A row starts a new session when it is the user's first request or
    arrives *duration* minutes (or more) after their previous request.
    *cutoff_buffer* (minutes) is the session_time credited to such
    session-opening requests; other rows get the gap to the previous
    request in seconds. Mutates and returns *df* with two new columns:
    new_session (0/1) and session_time (seconds).
    '''
    gap_seconds = df.groupby('user.id')['time'].diff().dt.total_seconds()
    is_new = gap_seconds.isnull() | (gap_seconds >= duration * 60)
    df['new_session'] = is_new.astype(int)
    df['session_time'] = np.where(is_new, cutoff_buffer * 60, gap_seconds)
    return df

71 

72 

def prepare_logs(df, session_threshold=15, cutoff_buffer=0):
    '''Clean raw request-log rows and annotate them with session metrics.

    - removes rows with errors in time, duration, status
    - sorts by time (logging via threads may not maintain order)
    - adds session metrics (new_session, session_time)

    Parameters:
        df: raw log DataFrame with at least time/duration/status/user.id
        session_threshold: minutes of inactivity that start a new session
        cutoff_buffer: minutes credited to session-opening requests
    Returns the cleaned DataFrame. The input frame's `time` column is
    coerced in place; the returned frame is otherwise a copy.
    '''
    df['time'] = pd.to_datetime(df['time'], unit='ms', errors='coerce')
    # Ignore pre-2000 year and null/NaT rows. Copy so subsequent column
    # writes hit a real frame instead of a view of the caller's frame
    # (avoids pandas SettingWithCopyWarning / silently lost writes).
    df = df[df['time'] > '2000-01-01'].copy()
    for col in ['duration', 'status']:
        if not np.issubdtype(df[col].dtype, np.number):
            df[col] = pd.to_numeric(df[col], errors='coerce')
        # Copy again after each filter for the same view-vs-copy reason.
        df = df[df[col].notnull()].copy()
    # logging via threads may not maintain order
    df = df.sort_values(by='time')
    # add new_session / session_time columns
    df = add_session(df, duration=session_threshold, cutoff_buffer=cutoff_buffer)
    return df

91 

92 

def summarize(transforms=[], post_transforms=[], run=True,
              session_threshold=15, cutoff_buffer=0):
    '''Incrementally aggregate request logs into sqlite agg{level} tables.

    Reads the gramex request-log CSV files, cleans them with prepare_logs,
    applies *transforms* to the raw rows, aggregates per level (M/W/D)
    with pdagg, applies *post_transforms* to each summary frame, and
    appends the results to logviewer.db stored next to the log files.

    Parameters:
        transforms: specs passed to apply_transform on the raw rows
        post_transforms: specs applied to each aggregated frame
        run: True for a normal incremental run; 'drop' drops the agg
             tables and returns; 'reload' drops them and re-aggregates
        session_threshold: minutes of inactivity that start a new session
        cutoff_buffer: minutes credited to session-opening requests

    NOTE(review): mutable default arguments ([]) are shared across calls;
    harmless here since they are only iterated, never mutated.
    '''
    app_log.info('logviewer: Summarize started')
    levels = DB_CONFIG['levels']
    table = DB_CONFIG['table'].format
    # dimensions and metrics to summarize
    groups = DB_CONFIG['dimensions']
    aggfuncs = DB_CONFIG['metrics']
    log_file = conf.log.handlers.requests.filename
    # Handle for multiple instances requests.csv$LISTENPORT
    log_file = '{0}{1}'.format(*log_file.partition('.csv'))
    folder = os.path.dirname(log_file)
    conn = sqlite3.connect(os.path.join(folder, 'logviewer.db'))
    # drop agg tables from database
    if run in ['drop', 'reload']:
        droptable = 'DROP TABLE IF EXISTS {}'.format
        for freq in levels:
            app_log.info('logviewer: Dropping {} table'.format(table(freq)))
            conn.execute(droptable(table(freq)))
        conn.commit()
        conn.execute('VACUUM')
        if run == 'drop':
            conn.close()
            return
    # all log files sorted by modified time
    log_files = sorted(glob(log_file + '*'), key=os.path.getmtime)
    max_date = None

    def filesince(filename, date):
        # Keep files whose YYYY-MM-DD backup suffix is on/after *date*;
        # files without a date suffix (the live log) are always kept.
        match = re.search(r'(\d{4}-\d{2}-\d{2})$', filename)
        backupdate = match.group() if match else ''
        return backupdate >= date or backupdate == ''

    # get this month log files if db is already created
    if table_exists(table(levels[-1]), conn):
        max_date = pd.read_sql(
            'SELECT MAX(time) FROM {}'.format(
                table(levels[-1])), conn).iloc[0, 0]
        app_log.info('logviewer: last processed till %s', max_date)
        # First day of the month containing max_date (max_date is an ISO
        # 'YYYY-MM-DD...' string here, so [:8] keeps 'YYYY-MM-').
        this_month = max_date[:8] + '01'
        log_files = [f for f in log_files if filesince(f, this_month)]
        max_date = pd.to_datetime(max_date)

    if not log_files:
        app_log.info('logviewer: no log files to process')
        return
    # Create dataframe from log files
    columns = conf.log.handlers.requests['keys']
    # TODO: avoid concat?
    app_log.info('logviewer: files to process %s', log_files)
    data = pd.concat([
        pd.read_csv(f, names=columns, encoding='utf-8').fillna('-')
        for f in log_files
    ], ignore_index=True)
    app_log.info(
        'logviewer: prepare_logs {} rows with {} mint session_threshold'.format(
            len(data.index), session_threshold))
    data = prepare_logs(df=data,
                        session_threshold=session_threshold,
                        cutoff_buffer=cutoff_buffer)
    app_log.info('logviewer: processed and returned {} rows'.format(len(data.index)))
    # apply transforms on raw data
    app_log.info('logviewer: applying transforms')
    for spec in transforms:
        apply_transform(data, spec)  # applies on copy
    delete = 'DELETE FROM {} WHERE time >= "{}"'.format
    # levels should go from M > W > D
    for freq in levels:
        # filter dataframe for max_date.level
        if max_date:
            # Re-aggregate from the start of the current period (week /
            # month) so partially-aggregated periods get recomputed.
            date_from = max_date
            if freq == 'W':
                date_from -= pd.offsets.Day(max_date.weekday())
            if freq == 'M':
                date_from -= pd.offsets.MonthBegin(1)
            data = data[data.time.ge(date_from)]
            # delete old records
            conn.execute(delete(table(freq), date_from))
            conn.commit()
        # Overwrite the '?level' placeholder with the current frequency.
        groups[0]['freq'] = freq
        # get summary view
        app_log.info('logviewer: pdagg for {}'.format(table(freq)))
        dff = pdagg(data, groups, aggfuncs)
        # apply post_transforms here
        app_log.info('logviewer: applying post_transforms')
        for spec in post_transforms:
            apply_transform(dff, spec)
        # insert new records
        try:
            dff.to_sql(table(freq), conn, if_exists='append', index=False)
        # dff columns should match with table columns
        # if not, call summarize run='reload' to
        # drop all the tables and rerun the job
        except sqlite3.OperationalError:
            app_log.info('logviewer: OperationalError: run: reload')
            summarize(transforms=transforms, run='reload')
            return
    conn.close()
    app_log.info('logviewer: Summarize completed')
    return

194 

195 

def prepare_where(query, args, columns):
    '''Translate URL filter *args* into a SQL WHERE clause for *query*.

    Keys are parsed with gramex.data._filter_col; keys that do not resolve
    to a known column in *columns* are skipped. Returns '' when nothing
    applies; otherwise the clause is prefixed with 'WHERE ' (or 'AND '
    when *query* already contains ' WHERE ').
    NOTE(review): values are interpolated into the SQL text directly —
    callers must ensure they are trusted.
    '''
    conditions = []
    for key, vals in args.items():
        col, agg, op = gramex.data._filter_col(key, columns)
        if col not in columns:
            continue
        if op in ('', '!'):
            membership = 'NOT IN' if op == '!' else 'IN'
            conditions.append(
                '"{}" {} ("{}")'.format(col, membership, '", "'.join(vals)))
        elif op in ('>', '>~'):
            compare = '>' if op == '>' else '>='
            conditions.append('"{}" {} "{}"'.format(col, compare, min(vals)))
        elif op in ('<', '<~'):
            compare = '<' if op == '<' else '<='
            conditions.append('"{}" {} "{}"'.format(col, compare, max(vals)))
        elif op in ('~', '!~'):
            like = 'NOT LIKE' if op == '!~' else 'LIKE'
            clause = ' OR '.join(
                '"{}" {} "%{}%"'.format(col, like, v) for v in vals)
            conditions.append('({})'.format(clause))
    combined = ' AND '.join(conditions)
    if not combined:
        return combined
    prefix = 'AND ' if ' WHERE ' in query else 'WHERE '
    return prefix + combined

227 

228 

def query(handler, args):
    '''Build the SQL statement for a logviewer request.

    Looks up the query template named by the URL's `query` path argument
    in the handler's configured queries, builds a WHERE clause from the
    request *args*, and fills the template's {table} and {where} slots.
    '''
    queries = handler.kwargs.kwargs.queries
    table_name = handler.path_kwargs.get('table')
    case = handler.path_kwargs.get('query')
    template = queries.get(case)
    where = prepare_where(template, args, DB_CONFIG['table_columns'])
    return template.format(table=table_name, where=where)

238 

239 

def apply_transform(data, spec):
    '''Apply one transform *spec* to the dataframe *data* and return it.

    Two spec types:
    - 'function': spec['expr'] is a gramex expression compiled with
      build_transform and invoked with the dataframe.
    - otherwise: spec['expr'] names a pandas op (REPLACE, MAP, IN, ...),
      the source column ('col'), an optional 'value' and 'kwargs'; the
      result is stored in the column named by spec['as'].
    '''
    pandas_transforms = {
        'REPLACE': pd.Series.replace,
        'MAP': pd.Series.map,
        'IN': pd.Series.isin,
        'NOTIN': lambda s, v: ~s.isin(v),
        'CONTAINS': {
            'function': lambda s, v, **kw: s.str.contains(v, **kw),
            'defaults': {'case': False}
        },
        'NOTCONTAINS': {
            'function': lambda s, v, **kw: ~s.str.contains(v, **kw),
            'defaults': {'case': False}
        },
        'LEN': lambda s, _: s.str.len(),
        'LOWER': lambda s, _: s.str.lower(),
        'UPPER': lambda s, _: s.str.upper(),
        'PROPER': lambda s, _: s.str.capitalize(),
        'STARTSWITH': lambda s, v: s.str.startswith(v),
        'ENDSWITH': lambda s, v: s.str.endswith(v)
    }
    # TODO: STRREPLACE
    if spec['type'] == 'function':
        fn = build_transform(
            {'function': spec['expr']}, vars={'data': None},
            filename='lv: %s' % spec.get('name'))
        fn(data)  # applies on copy
        return data
    expr = spec['expr']
    op = pandas_transforms[expr['op']]
    options = expr.get('kwargs', {})
    if isinstance(op, dict):
        # Fill in the op's default kwargs where the spec left them unset.
        for key, val in op.get('defaults', {}).items():
            options.setdefault(key, val)
        op = op['function']
    data[spec['as']] = op(data[expr['col']], expr.get('value'), **options)
    return data

280 

281 

def get_config(handler=None):
    '''Return the logviewer UI config as a dict.

    Reads the YAML file named by the handler's `path_ui` kwarg, falling
    back to the bundled CONFIG_FILE when no handler (or no override) is
    given.
    '''
    # Bug fix: with the documented default handler=None, the old code
    # crashed with AttributeError on handler.kwargs; honour the default.
    if handler is None:
        file_path = CONFIG_FILE
    else:
        file_path = handler.kwargs.get('path_ui', CONFIG_FILE)
    return gramex.cache.open(file_path, 'config')

286 

287 

def load_component(page, **kwargs):
    '''Render the cached template *page* (path relative to the caller)
    with *kwargs* and return the generated output.'''
    template = gramex.cache.open(page, 'template', rel=True)
    return template.generate(**kwargs)

291 

292 

def load_layout(config):
    '''Generate the HTML layout for *config* as a string.

    Builds the element tree under a temporary <root> element and strips
    the wrapping '<root>'/'</root>' tags (6 and 7 characters) from the
    serialised output.
    '''
    tree = eltree(config, root=Element('root'))
    return tostring(tree)[6:-7]

296 

297 

def eltree(data, root=None):
    '''Convert dict to etree.Element(s).

    Key conventions inside *data*:
      '@name' -- set attribute `name` on the current element
      '$'     -- set the current element's text content
      '_$'    -- template hook: render '{tpl}.html' via load_component
                 (with tpl['values'], defaulting to the tpl dict itself)
                 and append the parsed HTML node(s)
      other keys become child elements, recursing into their values;
      a list value produces one child element per item.

    Returns *root* with children appended, or (when called without a
    root) a list of the created Elements. Non-dict *data* becomes a
    single Element whose tag is the stringified value.
    '''
    attr_prefix = '@'
    text_key = '$'
    tpl_key = '_$'
    # Accumulate into a plain list at the top level, or append directly
    # to the provided parent element when recursing.
    result = [] if root is None else root
    if isinstance(data, dict):
        for key, value in data.items():
            # Attribute/text/template conventions only make sense when a
            # parent element exists to attach them to.
            if root is not None:
                # attribute prefixes
                if key.startswith(attr_prefix):
                    key = key.lstrip(attr_prefix)
                    result.set(key, unicode(value))
                    continue
                # text content
                if key == text_key:
                    result.text = unicode(value)
                    continue
                # template hooks
                if key == tpl_key:
                    for tpl in value if isinstance(value, list) else [value]:
                        template = '{}.html'.format(tpl['tpl'])
                        raw_node = load_component(template, values=tpl.get('values', tpl))
                        result.append(fromstring(raw_node))
                    continue
            # add other keys as children
            values = value if isinstance(value, list) else [value]
            for value in values:
                elem = Element(key)
                result.append(elem)
                # scalars to text: wrap so the recursion sets .text
                if not isinstance(value, (dict, list)):
                    value = {text_key: value}
                eltree(value, root=elem)
    else:
        result.append(Element(unicode(data)))
    return result