# vim:fenc=utf-8

"""
Search tools.
"""

from itertools import chain

import numpy as np
import pandas as pd
import six

from gramex.apps.nlg import nlgutils as utils
from gramex.apps.nlg.grammar import find_inflections


SEARCH_PRIORITIES = [
    {'type': 'ne'},  # A match which is a named entity gets the highest priority,
    {'location': 'fh_args'},  # then one that is a FormHandler argument,
    {'location': 'colname'},  # then one that is a column name,
    {'type': 'quant'},  # then one that is a quantitative value,
    {'location': 'cell'}  # and finally one that is a cell value.
]


def _sort_search_results(items, priorities=SEARCH_PRIORITIES):
    """
    Sort a list of search results by `priorities`.

    Parameters
    ----------
    items : list
        List of search results, where each result is a dict describing one
        location where a token was found. Typically this is one of the value
        lists of a `DFSearchResults` object.
    priorities : list, optional
        List of rules by which to sort search results. A `rule` is any
        subset of a search result dictionary. Lower indices indicate higher priorities.

    Returns
    -------
    list
        The input search results, with the highest-priority result marked as
        `enabled`.
    """
    match_ix = [[six.viewitems(p) <= six.viewitems(item) for p in priorities] for item in items]
    min_match = [m.index(True) for m in match_ix]
    items[min_match.index(min(min_match))]['enabled'] = True
    return items
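
# A minimal sketch of the priority resolution above (illustrative, not part
# of the original module; assumes a toy list of search results):
#
#     >>> results = [{'location': 'cell', 'type': 'quant'},
#     ...            {'location': 'colname', 'type': 'ne'}]
#     >>> _sort_search_results(results)[1]
#     {'location': 'colname', 'type': 'ne', 'enabled': True}
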

class DFSearchResults(dict):
    """A convenience wrapper around `dict` to collect search results.

    Differs from `dict` in that values are always lists, and setting an
    existing key appends to the list."""

    def __setitem__(self, key, value):
        if key not in self:
            super(DFSearchResults, self).__setitem__(key, [value])
        elif self[key][0] != value:
            self[key].append(value)

    def update(self, other):
        # Needed because the default update method doesn't use __setitem__.
        for k, v in other.items():
            self[k] = v

    def clean(self):
        """Sort the search results for each token by priority and un-overlap tokens."""
        for k, v in self.items():
            _sort_search_results(v)
        # Un-overlap the keys: drop tokens which are substrings of other tokens.
        to_remove = []
        for k in self:
            if any([k in c for c in six.viewkeys(self) - {k}]):
                to_remove.append(k)
        for i in to_remove:
            del self[i]
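
# Sketch of the append-on-set behaviour (illustrative, not part of the
# original module):
#
#     >>> res = DFSearchResults()
#     >>> res['nation'] = {'location': 'colname'}
#     >>> res['nation'] = {'location': 'cell'}
#     >>> res['nation']
#     [{'location': 'colname'}, {'location': 'cell'}]
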

class DFSearch(object):
    """Make a dataframe searchable."""

    def __init__(self, df, nlp=None, **kwargs):
        """Default constructor.

        Parameters
        ----------
        df : pd.DataFrame
            The dataframe to search.
        nlp : A `spacy.lang` model, optional
        """
        self.df = df
        # Results are a map of tokens to a list of search results.
        self.results = DFSearchResults()
        if not nlp:
            nlp = utils.load_spacy_model()
        self.nlp = nlp
        self.matcher = kwargs.get('matcher', utils.make_np_matcher(self.nlp))

    def search(self, text, colname_fmt="df.columns[{}]",
               cell_fmt="df['{}'].iloc[{}]", **kwargs):
        """
        Search the dataframe.

        Parameters
        ----------
        text : str
            The text to search.
        colname_fmt : str, optional
            String format to describe dataframe columns in the search results,
            can be one of "df.columns[{}]" or "df[{}]".
        cell_fmt : str, optional
            String format to describe dataframe values in the search results.
            Can be one of "df.iloc[{}, {}]", "df.loc[{}, {}]", "df[{}][{}]", etc.

        Returns
        -------
        dict
            A dictionary whose keys are tokens from `text` found in
            the source dataframe, and values are a list of locations in the df
            where they are found.
        """
        self.search_nes(text)
        for token, ix in self.search_columns(text, **kwargs).items():
            ix = utils.sanitize_indices(self.df.shape, ix, 1)
            self.results[token] = {'location': 'colname', 'tmpl': colname_fmt.format(ix),
                                   'type': 'token'}
        for token, (x, y) in self.search_table(text, **kwargs).items():
            x = utils.sanitize_indices(self.df.shape, x, 0)
            y = utils.sanitize_indices(self.df.shape, y, 1)
            self.results[token] = {
                'location': "cell", 'tmpl': cell_fmt.format(self.df.columns[y], x),
                'type': 'token'}
        self.search_quant([c.text for c in self.doc if c.pos_ == 'NUM'])
        return self.results
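
    # An end-to-end sketch of `search` (illustrative; assumes a spacy model
    # that tags 'India' as a named entity in the toy dataframe below):
    #
    #     >>> df = pd.DataFrame({'nation': ['India', 'China'],
    #     ...                    'population': [1300, 1400]})
    #     >>> DFSearch(df).search('India has a population of 1300')
    #     ... # maps 'India' to "df['nation'].iloc[0]" (a cell, type 'ne'),
    #     ... # 'population' to "df.columns[1]", and 1300 to "df['population'].iloc[0]".
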

    def search_nes(self, text, colname_fmt="df.columns[{}]", cell_fmt="df['{}'].iloc[{}]"):
        """Find named entities in text, and search for them in the dataframe.

        Parameters
        ----------
        text : str
            The text to search.
        """
        self.doc = self.nlp(text)
        self.ents = utils.ner(self.doc, self.matcher)
        ents = [c.text for c in self.ents]
        for token, ix in self.search_columns(ents, literal=True).items():
            ix = utils.sanitize_indices(self.df.shape, ix, 1)
            self.results[token] = {
                'location': "colname",
                'tmpl': colname_fmt.format(ix), 'type': 'ne'
            }
        for token, (x, y) in self.search_table(ents, literal=True).items():
            x = utils.sanitize_indices(self.df.shape, x, 0)
            y = utils.sanitize_indices(self.df.shape, y, 1)
            self.results[token] = {
                'location': "cell",
                'tmpl': cell_fmt.format(self.df.columns[y], x), 'type': 'ne'}
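
    # A sketch of entity search (illustrative; assumes the spacy model tags
    # 'India' as an entity):
    #
    #     >>> dfs = DFSearch(pd.DataFrame({'nation': ['India', 'China']}))
    #     >>> dfs.search_nes('Is India growing?')
    #     >>> dfs.results
    #     {'India': [{'location': 'cell', 'tmpl': "df['nation'].iloc[0]", 'type': 'ne'}]}
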

    def search_table(self, text, **kwargs):
        """Search the `.values` attribute of the dataframe for tokens in `text`."""
        kwargs['array'] = self.df.copy()
        return self._search_array(text, **kwargs)

    def search_columns(self, text, **kwargs):
        """Search dataframe columns for tokens in `text`."""
        kwargs['array'] = self.df.columns
        return self._search_array(text, **kwargs)
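
    # Sketch: column search matches on lemmas by default (illustrative;
    # assumes spacy lemmatizes 'nations' to 'nation'):
    #
    #     >>> DFSearch(pd.DataFrame(columns=['nation'])).search_columns('nations of the world')
    #     {'nations': 0}
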

    def search_quant(self, quants, nround=2, cell_fmt="df['{}'].iloc[{}]"):
        """Search the dataframe for a set of quantitative values.

        Parameters
        ----------
        quants : list / array-like
            The values to search.
        nround : int, optional
            Numeric values in the dataframe are rounded to this many
            significant digits before searching.
        """
        dfclean = utils.sanitize_df(self.df, nround)
        quants = np.array(quants)
        n_quant = quants.astype('float').round(nround)
        for x, y in zip(*dfclean.isin(n_quant).values.nonzero()):
            x = utils.sanitize_indices(dfclean.shape, x, 0)
            y = utils.sanitize_indices(dfclean.shape, y, 1)
            tk = quants[n_quant == dfclean.iloc[x, y]][0].item()
            self.results[tk] = {
                'location': "cell", 'tmpl': cell_fmt.format(self.df.columns[y], x),
                'type': 'quant'}
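
    # Sketch: quantitative search matches values after rounding (illustrative;
    # assumes utils.sanitize_df rounds numeric columns to `nround` digits):
    #
    #     >>> dfs = DFSearch(pd.DataFrame({'growth': [1.4142, 2.7182]}))
    #     >>> dfs.search_quant([1.41])
    #     >>> dfs.results
    #     {1.41: [{'location': 'cell', 'tmpl': "df['growth'].iloc[0]", 'type': 'quant'}]}
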

    def _search_array(self, text, array, literal=False,
                      case=False, lemmatize=True, nround=2):
        """Search for tokens in text within an array.

        Parameters
        ----------
        text : str or spacy document
            Text to search.
        array : array-like
            Array to search in.
        literal : bool, optional
            Whether to match tokens to values literally.
        case : bool, optional
            If true, run a case-sensitive search.
        lemmatize : bool, optional
            If true (default), search on lemmas of tokens and values.
        nround : int, optional
            Significant digits used to round `array` before searching.

        Returns
        -------
        dict
            Mapping of tokens to a sequence of indices within `array`.

        Example
        -------
        >>> _search_array('3', np.arange(5))
        {'3': [2]}
        >>> df = pd.DataFrame(np.eye(3), columns='one punch man'.split())
        >>> _search_array('1', df.values)
        {'1': [(0, 0), (1, 1), (2, 2)]}
        >>> _search_array('punched man', df.columns)
        {'punched': [1], 'man': [2]}
        >>> _search_array('1 2 buckle my shoe', df.index)
        {'1': [1], '2': [2]}
        """
        if literal:
            # Expect text to be a list of strings; no preprocessing on anything.
            if not isinstance(text, list):
                raise TypeError('text is expected to be list of strs when literal=True.')
            valid_types = {float, int, six.text_type}
            if not set([type(c) for c in text]).issubset(valid_types):
                raise TypeError('text can contain only strings or numbers when literal=True.')
            tokens = {c: str(c) for c in text}
        elif lemmatize:
            tokens = {c.lemma_: c.text for c in self.nlp(text)}
            if array.ndim == 1:
                array = [c if isinstance(c, six.text_type) else six.u(c) for c in array]
                array = [self.nlp(c) for c in array]
                array = pd.Series([token.lemma_ for doc in array for token in doc])
            else:
                for col in array.columns[array.dtypes == np.dtype('O')]:
                    s = [c if isinstance(c, six.text_type) else six.u(c) for c in array[col]]
                    s = [self.nlp(c) for c in s]
                    try:
                        array[col] = [token.lemma_ for doc in s for token in doc]
                    except ValueError:
                        # You cannot lemmatize columns that have multi-word values
                        if not case:  # still need to respect the `case` param
                            array[col] = array[col].str.lower()
        else:
            if not case:
                tokens = {c.text.lower(): c.text for c in self.nlp(text)}
                if array.ndim == 1:
                    array = array.str.lower()
                else:
                    for col in array.columns[array.dtypes == np.dtype('O')]:
                        array[col] = array[col].str.lower()
            else:
                tokens = {c.text: c.text for c in self.nlp(text)}
        mask = array.isin(tokens.keys())
        if mask.ndim == 1:
            if mask.any():
                ix = mask.nonzero()[0]
                return {tokens[array[i]]: i for i in ix}
            return {}
        else:
            if mask.any().any():
                ix, iy = mask.values.nonzero()
                return {tokens[array.iloc[x, y]]: (x, y) for x, y in zip(ix, iy)}
            return {}

def search_args(entities, args, lemmatized=True, fmt="fh_args['{}'][{}]",
                argkeys=('_sort', '_by', '_c')):
    """
    Search FormHandler arguments, as parsed by g1, for a set of tokens.

    Parameters
    ----------
    entities : list
        List of named entities found in the source text.
    args : dict
        FormHandler args as parsed by g1.url.parse(...).searchList.
    lemmatized : bool, optional
        Whether to search on lemmas of text values.
    fmt : str, optional
        String format used to describe FormHandler arguments in the template.
    argkeys : list, optional
        FormHandler argument keys to be considered for the search. Any key not
        present in this will be ignored.
        # TODO: Column names can be keys too!!

    Returns
    -------
    dict
        Mapping of entities / tokens to objects describing where they are found
        in FormHandler arguments. Each search result object has the following
        structure:
        {
            "type": "token",
            "location": "fh_args",
            "tmpl": "fh_args['_by'][0]"  # The template that gets this token from fh_args
        }
    """
    nlp = utils.load_spacy_model()
    args = {k: v for k, v in args.items() if k in argkeys}
    search_res = {}
    ent_tokens = list(chain(*entities))
    for k, v in args.items():
        v = [t.lstrip('-') for t in v]
        # argtokens = list(chain(*[re.findall(r"\w+", f) for f in v]))
        argtokens = list(chain(*[nlp(c) for c in v]))
        for i, x in enumerate(argtokens):
            for y in ent_tokens:
                if lemmatized:
                    if x.lemma_ == y.lemma_:
                        search_res[y.text] = {
                            'type': 'token', 'tmpl': fmt.format(k, i),
                            'location': 'fh_args'}
                else:
                    if x.text == y.text:
                        search_res[y.text] = {
                            'type': 'token', 'tmpl': fmt.format(k, i),
                            'location': 'fh_args'}
    return search_res
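
# A sketch of matching entities against FormHandler args (illustrative;
# assumes utils.ner picks 'population' out of the text as an entity):
#
#     >>> nlp = utils.load_spacy_model()
#     >>> ents = utils.ner(nlp('Top nations by population'), utils.make_np_matcher(nlp))
#     >>> search_args(ents, {'_by': ['population']})
#     {'population': {'type': 'token', 'tmpl': "fh_args['_by'][0]", 'location': 'fh_args'}}
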

def templatize(text, args, df):
    """Construct a tornado template which regenerates some
    text from a dataframe and FormHandler arguments.

    The pipeline consists of:
    1. cleaning the text and the dataframe
    2. searching the dataframe and FH args for tokens in the text
    3. detecting inflections on the tokens.

    Parameters
    ----------
    text : str
        Input text.
    args : dict
        FormHandler arguments.
    df : pd.DataFrame
        Source dataframe.

    Returns
    -------
    tuple
        Search results, cleaned text and token inflections. The webapp uses
        these to construct a tornado template.
    """
    text = six.u(text)
    args = {six.u(k): [six.u(c) for c in v] for k, v in args.items()}
    utils.load_spacy_model()
    clean_text = utils.sanitize_text(text)
    args = utils.sanitize_fh_args(args)
    dfs = DFSearch(df)
    dfix = dfs.search(clean_text)
    dfix.update(search_args(dfs.ents, args))
    dfix.clean()
    inflections = find_inflections(clean_text, dfix, args, df)
    _infl = {}
    for token, funcs in inflections.items():
        _infl[token] = []
        for func in funcs:
            _infl[token].append({
                'source': func.source,
                'fe_name': func.fe_name,
                'func_name': func.__name__
            })
    return dfix, clean_text, _infl
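
# End-to-end sketch of the templatization pipeline (illustrative; assumes a
# spacy model and g1-style FormHandler arguments):
#
#     >>> df = pd.DataFrame({'nation': ['India', 'China'],
#     ...                    'population': [1300, 1400]})
#     >>> dfix, clean_text, infl = templatize(
#     ...     'India is the most populous nation.', {'_sort': ['-population']}, df)
#     >>> # `dfix` maps tokens like 'India' to locations in `df` or fh_args;
#     >>> # the webapp substitutes these back into `clean_text` as a template.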