Coverage for gramex\apps\nlg\nlgsearch.py: 94%

# vim:fenc=utf-8
"""
Search tools.
"""

from itertools import chain

import numpy as np
import pandas as pd
import six

from gramex.apps.nlg import nlgutils as utils
from gramex.apps.nlg.grammar import find_inflections
SEARCH_PRIORITIES = [
    {'type': 'ne'},           # A match that is a named entity gets the highest priority,
    {'location': 'fh_args'},  # followed by one that is a formhandler argument,
    {'location': 'colname'},  # then one that is a column name,
    {'type': 'quant'},        # then one that is a quantity,
    {'location': 'cell'}      # and finally one that is a cell value.
]
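
# For illustration: under these priorities, a column-name match such as
# {'location': 'colname', 'type': 'token'} outranks a quantity match such as
# {'location': 'cell', 'type': 'quant'}, because the colname rule appears
# earlier in SEARCH_PRIORITIES.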


def _sort_search_results(items, priorities=SEARCH_PRIORITIES):
    """
    Sort a list of search results by `priorities`.

    Parameters
    ----------
    items : list
        List of search results for a single token, where each result is a
        dict describing one location where the token was found. Typically
        this is one of the values of a `DFSearchResults` object.
    priorities : list, optional
        List of rules by which to sort the search results. A `rule` is any
        subset of a search result dictionary. Lower indices indicate higher
        priorities.

    Returns
    -------
    list
        The same list of search results, with the highest-priority result
        marked with `'enabled': True`.
    """
    match_ix = [[six.viewitems(p) <= six.viewitems(item) for p in priorities] for item in items]
    min_match = [m.index(True) for m in match_ix]
    items[min_match.index(min(min_match))]['enabled'] = True
    return items
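
# A minimal sketch of what _sort_search_results does (the match dicts here
# are made up for illustration):
#
#   >>> matches = [{'location': 'cell', 'type': 'quant'},
#   ...            {'location': 'colname', 'type': 'token'}]
#   >>> _sort_search_results(matches)[1]
#   {'location': 'colname', 'type': 'token', 'enabled': True}
#
# The colname match wins because its rule appears earlier in the priorities.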


class DFSearchResults(dict):
    """A convenience wrapper around `dict` to collect search results.

    Different from `dict` in that values are always lists, and setting an
    existing key appends to the list."""

    def __setitem__(self, key, value):
        if key not in self:
            super(DFSearchResults, self).__setitem__(key, [value])
        elif self[key][0] != value:
            self[key].append(value)

    def update(self, other):
        # Override dict.update, which does not route through __setitem__.
        for k, v in other.items():
            self[k] = v

    def clean(self):
        """Sort the search results for each token by priority, and remove
        tokens that overlap with others."""
        for k, v in self.items():
            _sort_search_results(v)
        # Remove any token that is a substring of another token.
        to_remove = []
        for k in self:
            if any([k in c for c in six.viewkeys(self) - {k}]):
                to_remove.append(k)
        for i in to_remove:
            del self[i]
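
# Usage sketch: assigning to an existing key appends rather than overwrites
# (the search results here are made up for illustration):
#
#   >>> res = DFSearchResults()
#   >>> res['nation'] = {'location': 'colname', 'tmpl': 'df.columns[0]'}
#   >>> res['nation'] = {'location': 'cell', 'tmpl': "df['nation'].iloc[0]"}
#   >>> len(res['nation'])
#   2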


class DFSearch(object):
    """Make a dataframe searchable."""

    def __init__(self, df, nlp=None, **kwargs):
        """Default constructor.

        Parameters
        ----------
        df : pd.DataFrame
            The dataframe to search.
        nlp : A `spacy.lang` model, optional
        """
        self.df = df
        # Search results are held as a map of tokens to lists of matches.
        self.results = DFSearchResults()
        if not nlp:
            nlp = utils.load_spacy_model()
        self.nlp = nlp
        self.matcher = kwargs.get('matcher', utils.make_np_matcher(self.nlp))
    def search(self, text, colname_fmt="df.columns[{}]",
               cell_fmt="df['{}'].iloc[{}]", **kwargs):
        """
        Search the dataframe.

        Parameters
        ----------
        text : str
            The text to search.
        colname_fmt : str, optional
            String format used to describe dataframe columns in the search
            results. Can be one of "df.columns[{}]" or "df[{}]".
        cell_fmt : str, optional
            String format used to describe dataframe values in the search
            results. Can be one of "df.iloc[{}, {}]", "df.loc[{}, {}]",
            "df[{}][{}]", etc.

        Returns
        -------
        dict
            A dictionary whose keys are tokens from `text` found in the
            source dataframe, and whose values are lists of locations in the
            df where they are found.
        """
        self.search_nes(text)
        for token, ix in self.search_columns(text, **kwargs).items():
            ix = utils.sanitize_indices(self.df.shape, ix, 1)
            self.results[token] = {'location': 'colname', 'tmpl': colname_fmt.format(ix),
                                   'type': 'token'}
        for token, (x, y) in self.search_table(text, **kwargs).items():
            x = utils.sanitize_indices(self.df.shape, x, 0)
            y = utils.sanitize_indices(self.df.shape, y, 1)
            self.results[token] = {
                'location': 'cell', 'tmpl': cell_fmt.format(self.df.columns[y], x),
                'type': 'token'}
        self.search_quant([c.text for c in self.doc if c.pos_ == 'NUM'])
        return self.results
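
    # Usage sketch (the dataframe and sentence are made up for illustration;
    # actual matches depend on the loaded spaCy model):
    #
    #   >>> df = pd.DataFrame({'nation': ['India', 'China'],
    #   ...                    'population': [1.3, 1.4]})
    #   >>> DFSearch(df).search('China has a population of 1.4 billion')  # doctest: +SKIP
    #   {'China': [{'location': 'cell', ...}],
    #    'population': [{'location': 'colname', ...}], ...}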

    def search_nes(self, text, colname_fmt="df.columns[{}]", cell_fmt="df['{}'].iloc[{}]"):
        """Find named entities in text, and search for them in the dataframe.

        Parameters
        ----------
        text : str
            The text to search.
        colname_fmt : str, optional
            String format used to describe matching dataframe columns.
        cell_fmt : str, optional
            String format used to describe matching dataframe cells.
        """
        self.doc = self.nlp(text)
        self.ents = utils.ner(self.doc, self.matcher)
        ents = [c.text for c in self.ents]
        for token, ix in self.search_columns(ents, literal=True).items():
            ix = utils.sanitize_indices(self.df.shape, ix, 1)
            self.results[token] = {
                'location': 'colname',
                'tmpl': colname_fmt.format(ix), 'type': 'ne'
            }
        for token, (x, y) in self.search_table(ents, literal=True).items():
            x = utils.sanitize_indices(self.df.shape, x, 0)
            y = utils.sanitize_indices(self.df.shape, y, 1)
            self.results[token] = {
                'location': 'cell',
                'tmpl': cell_fmt.format(self.df.columns[y], x), 'type': 'ne'}
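
    # Sketch: for text like "India is the most populous nation", utils.ner
    # would typically pick out "India" as an entity, and search_nes would
    # then look for it among the column names and cells of self.df. Entity
    # output depends on the loaded spaCy model.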

    def search_table(self, text, **kwargs):
        """Search the `.values` attribute of the dataframe for tokens in `text`."""
        kwargs['array'] = self.df.copy()
        return self._search_array(text, **kwargs)

    def search_columns(self, text, **kwargs):
        """Search df columns for tokens in `text`."""
        kwargs['array'] = self.df.columns
        return self._search_array(text, **kwargs)

    def search_quant(self, quants, nround=2, cell_fmt="df['{}'].iloc[{}]"):
        """Search the dataframe for a set of quantitative values.

        Parameters
        ----------
        quants : list / array-like
            The values to search.
        nround : int, optional
            Numeric values in the dataframe are rounded to this many
            significant digits before searching.
        """
        dfclean = utils.sanitize_df(self.df, nround)
        quants = np.array(quants)
        n_quant = quants.astype('float').round(nround)
        for x, y in zip(*dfclean.isin(n_quant).values.nonzero()):
            x = utils.sanitize_indices(dfclean.shape, x, 0)
            y = utils.sanitize_indices(dfclean.shape, y, 1)
            tk = quants[n_quant == dfclean.iloc[x, y]][0].item()
            self.results[tk] = {
                'location': 'cell', 'tmpl': cell_fmt.format(self.df.columns[y], x),
                'type': 'quant'}
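
    # Sketch (assuming utils.sanitize_df applies the same rounding to the
    # dataframe): with nround=2, a query value of 1.4142 is rounded to 1.41
    # before comparison, so it matches a cell that also rounds to 1.41.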

    def _search_array(self, text, array, literal=False,
                      case=False, lemmatize=True, nround=2):
        """Search for tokens in text within an array.

        Parameters
        ----------
        text : str or spacy document
            Text to search.
        array : array-like
            Array to search in.
        literal : bool, optional
            Whether to match tokens to values literally.
        case : bool, optional
            If true, run a case-sensitive search.
        lemmatize : bool, optional
            If true (default), search on lemmas of tokens and values.
        nround : int, optional
            Significant digits used to round `array` before searching.

        Returns
        -------
        dict
            Mapping of each token to its position within `array`: an integer
            index for 1-D arrays, or a (row, column) tuple for 2-D arrays.

        Examples
        --------
        >>> _search_array('3', np.arange(5))
        {'3': 3}
        >>> df = pd.DataFrame(np.eye(3), columns='one punch man'.split())
        >>> _search_array('1', df.values)
        {'1': (2, 2)}
        >>> _search_array('punched man', df.columns)
        {'punched': 1, 'man': 2}
        >>> _search_array('1 2 buckle my shoe', df.index)
        {'1': 1, '2': 2}
        """
        if literal:
            # Expect text to be a list of strings; no preprocessing on anything.
            if not isinstance(text, list):
                raise TypeError('text is expected to be list of strs when literal=True.')
            valid_types = {float, int, six.text_type}
            if not set([type(c) for c in text]).issubset(valid_types):
                raise TypeError('text can contain only strings or numbers when literal=True.')
            tokens = {c: str(c) for c in text}
        elif lemmatize:
            tokens = {c.lemma_: c.text for c in self.nlp(text)}
            if array.ndim == 1:
                array = [c if isinstance(c, six.text_type) else six.u(c) for c in array]
                array = [self.nlp(c) for c in array]
                array = pd.Series([token.lemma_ for doc in array for token in doc])
            else:
                for col in array.columns[array.dtypes == np.dtype('O')]:
                    s = [c if isinstance(c, six.text_type) else six.u(c) for c in array[col]]
                    s = [self.nlp(c) for c in s]
                    try:
                        array[col] = [token.lemma_ for doc in s for token in doc]
                    except ValueError:
                        # Cannot lemmatize columns that have multi-word values;
                        # still need to respect the `case` param.
                        if not case:
                            array[col] = array[col].str.lower()
        else:
            if not case:
                tokens = {c.text.lower(): c.text for c in self.nlp(text)}
                if array.ndim == 1:
                    array = array.str.lower()
                else:
                    for col in array.columns[array.dtypes == np.dtype('O')]:
                        array[col] = array[col].str.lower()
            else:
                tokens = {c.text: c.text for c in self.nlp(text)}
        mask = array.isin(tokens.keys())
        if mask.ndim == 1:
            if mask.any():
                ix = mask.nonzero()[0]
                return {tokens[array[i]]: i for i in ix}
            return {}
        else:
            if mask.any().any():
                ix, iy = mask.values.nonzero()
                return {tokens[array.iloc[x, y]]: (x, y) for x, y in zip(ix, iy)}
            return {}


def search_args(entities, args, lemmatized=True, fmt="fh_args['{}'][{}]",
                argkeys=('_sort', '_by', '_c')):
    """
    Search FormHandler arguments, as parsed by g1, for a set of tokens.

    Parameters
    ----------
    entities : list
        List of named entities found in the source text.
    args : dict
        FormHandler args as parsed by g1.url.parse(...).searchList.
    lemmatized : bool, optional
        Whether to search on lemmas of text values.
    fmt : str, optional
        String format used to describe FormHandler arguments in the template.
    argkeys : list, optional
        FormHandler argument keys to be considered for the search. Any key
        not present in this is ignored.
        # TODO: Column names can be keys too!!

    Returns
    -------
    dict
        Mapping of entities / tokens to objects describing where they are
        found in FormHandler arguments. Each search result object has the
        following structure:
        {
            "type": "some token",
            "location": "fh_args",
            "tmpl": "fh_args['_by'][0]"  # Template that gets this token from fh_args
        }
    """
    nlp = utils.load_spacy_model()
    args = {k: v for k, v in args.items() if k in argkeys}
    search_res = {}
    ent_tokens = list(chain(*entities))
    for k, v in args.items():
        v = [t.lstrip('-') for t in v]
        # argtokens = list(chain(*[re.findall(r"\w+", f) for f in v]))
        argtokens = list(chain(*[nlp(c) for c in v]))
        for i, x in enumerate(argtokens):
            for y in ent_tokens:
                if lemmatized:
                    if x.lemma_ == y.lemma_:
                        search_res[y.text] = {
                            'type': 'token', 'tmpl': fmt.format(k, i),
                            'location': 'fh_args'}
                else:
                    if x.text == y.text:
                        search_res[y.text] = {
                            'type': 'token', 'tmpl': fmt.format(k, i),
                            'location': 'fh_args'}
    return search_res
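
# Sketch: if fh_args is {'_by': ['nation']} and the source text mentions
# "nation", search_args maps the token "nation" to the template
# "fh_args['_by'][0]", i.e.
# {'nation': {'type': 'token', 'location': 'fh_args',
#             'tmpl': "fh_args['_by'][0]"}}.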


def templatize(text, args, df):
    """Construct a tornado template which regenerates some
    text from a dataframe and FormHandler arguments.

    The pipeline consists of:
    1. cleaning the text and the dataframe
    2. searching the dataframe and FH args for tokens in the text
    3. detecting inflections on the tokens.

    Parameters
    ----------
    text : str
        Input text.
    args : dict
        FormHandler arguments.
    df : pd.DataFrame
        Source dataframe.

    Returns
    -------
    tuple
        Search results, cleaned text and token inflections. The webapp uses
        these to construct a tornado template.
    """
    text = six.u(text)
    args = {six.u(k): [six.u(c) for c in v] for k, v in args.items()}
    utils.load_spacy_model()  # ensure the spaCy model is loaded before searching
    clean_text = utils.sanitize_text(text)
    args = utils.sanitize_fh_args(args)
    dfs = DFSearch(df)
    dfix = dfs.search(clean_text)
    dfix.update(search_args(dfs.ents, args))
    dfix.clean()
    inflections = find_inflections(clean_text, dfix, args, df)
    _infl = {}
    for token, funcs in inflections.items():
        _infl[token] = []
        for func in funcs:
            _infl[token].append({
                'source': func.source,
                'fe_name': func.fe_name,
                'func_name': func.__name__
            })
    return dfix, clean_text, _infl
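

# Usage sketch (dataframe, text and fh_args are made up for illustration;
# actual results depend on the loaded spaCy model):
#
#   >>> df = pd.DataFrame({'nation': ['India', 'China'], 'population': [1.3, 1.4]})
#   >>> dfix, clean_text, infl = templatize(
#   ...     'China has a population of 1.4 billion', {'_by': ['nation']}, df)
#   >>> dfix  # doctest: +SKIP
#   {'China': [...], 'population': [...], 1.4: [...]}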