Coverage for gramex\apps\nlg\nlgutils.py: 72%

# vim:fenc=utf-8

"""
Miscellaneous utilities.
"""
import re

import six
from tornado.template import Template

from gramex.data import filter as grmfilter  # NOQA: F401

NP_RULES = {
    "NP1": [{"POS": "PROPN", "OP": "+"}],
    "NP2": [{"POS": "NOUN", "OP": "+"}],
    "NP3": [{"POS": "ADV", "OP": "+"}, {"POS": "VERB", "OP": "+"}],
    "NP4": [{"POS": "ADJ", "OP": "+"}, {"POS": "VERB", "OP": "+"}],
    "QUANT": [{"POS": "NUM", "OP": "+"}]
}
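
# Note (illustrative): each value above is a spaCy Matcher token pattern.
# NP1 matches runs of proper nouns (e.g. "Humphrey Bogart"), NP2 runs of
# common nouns, NP3 adverbs followed by verbs (e.g. "steadily increased"),
# NP4 adjectives followed by verbs, and QUANT runs of numerals.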

NARRATIVE_TEMPLATE = """
{% autoescape None %}
from nlg import grammar as G
from nlg import nlgutils as U
from tornado.template import Template as T
import pandas as pd

df = None  # set your dataframe here.
narrative = T(\"\"\"
    {{ tmpl }}
    \"\"\").generate(
    tornado_tmpl=True, orgdf=df, fh_args={{ fh_args }},
    G=G, U=U)
print(narrative)
"""
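
# Note (illustrative): NARRATIVE_TEMPLATE is itself a Tornado template.
# Rendering it with `tmpl` (the narrative template) and `fh_args`
# (FormHandler-style filter arguments) produces a standalone Python script
# that regenerates the narrative from a dataframe.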

_spacy = {
    'model': False,
    'lemmatizer': False,
    'matcher': False
}


def load_spacy_model():
    """Load the spacy model when required."""
    if not _spacy['model']:
        from spacy import load
        nlp = load("en_core_web_sm")
        _spacy['model'] = nlp
    else:
        nlp = _spacy['model']
    return nlp
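
# Illustrative usage (a sketch, assuming `en_core_web_sm` is installed):
#     nlp = load_spacy_model()
#     doc = nlp("Tokyo is the capital of Japan.")
# The model is cached in `_spacy`, so repeated calls reuse the same instance.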


def get_lemmatizer():
    if not _spacy['lemmatizer']:
        from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
        from spacy.lemmatizer import Lemmatizer
        lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
        _spacy['lemmatizer'] = lemmatizer
    else:
        lemmatizer = _spacy['lemmatizer']
    return lemmatizer
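
# Illustrative usage (a sketch; the imports above assume the spaCy 2.x
# lemmatizer API):
#     lemmatizer = get_lemmatizer()
#     lemmatizer('running', 'VERB')  # -> ['run']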


def make_np_matcher(nlp, rules=NP_RULES):
    """Make a rule-based noun phrase matcher.

    Parameters
    ----------
    nlp : `spacy.lang`
        The spacy model to use.
    rules : dict, optional
        Mapping of rule IDs to spacy attribute patterns, such that each
        mapping defines a noun phrase structure.

    Returns
    -------
    `spacy.matcher.Matcher`
    """
    if not _spacy['matcher']:
        from spacy.matcher import Matcher
        matcher = Matcher(nlp.vocab)
        for k, v in rules.items():
            matcher.add(k, None, v)
        _spacy['matcher'] = matcher
    else:
        matcher = _spacy['matcher']
    return matcher
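
# Illustrative usage (a sketch):
#     nlp = load_spacy_model()
#     matcher = make_np_matcher(nlp)
#     doc = nlp("Revenue grew sharply in the third quarter.")
#     spans = [doc[start:end] for _, start, end in matcher(doc)]
# Like the model, the matcher is cached in `_spacy` and built only once.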


def render_search_result(text, results, **kwargs):
    for token, tokenlist in results.items():
        tmpl = [t for t in tokenlist if t.get('enabled', False)][0]
        text = text.replace(token, '{{{{ {} }}}}'.format(tmpl['tmpl']))
    return Template(text).generate(**kwargs).decode('utf-8')
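
# Illustrative usage (a sketch with hypothetical values): `results` maps each
# literal token found in `text` to a list of candidate templates; the first
# one marked `enabled` replaces the token as a Tornado expression, and the
# resulting template is rendered with the keyword arguments.
#     render_search_result(
#         "Sales rose in Delhi.",
#         {"Delhi": [{"tmpl": "df['city'].iloc[0]", "enabled": True}]},
#         df=some_df)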


def join_words(x, sep=' '):
    return sep.join(re.findall(r'\w+', x, re.IGNORECASE))
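
# Example: join_words("Hello, world!") -> 'Hello world'
#          join_words("Hello, world!", sep='_') -> 'Hello_world'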


class set_nlg_gramopt(object):  # noqa: class to be used as a decorator
    """Decorator for adding callables to grammar options of the webapp.
    """
    def __init__(self, **kwargs):
        self.kwargs = kwargs

    def __call__(self, func):
        func.gramopt = True
        for k, v in self.kwargs.items():
            if not getattr(func, k, False):
                setattr(func, k, v)
        return func
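
# Illustrative usage (a sketch; the attribute names shown are assumptions,
# not part of this module):
#     @set_nlg_gramopt(source='G', fe_name='Singularize')
#     def singular(word):
#         ...
# The decorator sets `gramopt = True` on the callable and copies the keyword
# arguments onto it as attributes, without overwriting ones already present.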


def is_overlap(x, y):
    """Whether the text of span x is contained within any string in the
    sequence y. Spans containing a numeral are never treated as overlapping."""
    if "NUM" in [c.pos_ for c in x]:
        return False
    return any([x.text in yy for yy in y])


def unoverlap(tokens):
    """From a set of tokens, remove all tokens that are contained within
    others."""
    textmap = {c.text: c for c in tokens}
    text_tokens = six.viewkeys(textmap)
    newtokens = []
    for token in text_tokens:
        if not is_overlap(textmap[token], text_tokens - {token}):
            newtokens.append(token)
    return [textmap[t] for t in newtokens]
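
# Example: given spans with texts "the third quarter" and "third quarter",
# the shorter one is dropped because its text is contained in the longer one.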


def ner(doc, matcher, match_ids=False, remove_overlap=True):
    """Find all named entities and other nouns in a spacy doc.

    Parameters
    ----------
    doc : spacy.tokens.doc.Doc
        The document in which to search for entities.
    matcher : spacy.matcher.Matcher
        The rule-based matcher to use for finding noun phrases.
    match_ids : list, optional
        IDs from the spacy matcher to filter from the matches.
    remove_overlap : bool, optional
        Whether to remove overlapping tokens from the result.

    Returns
    -------
    list
        List of spacy.tokens.span.Span objects.
    """
    entities = set()
    for span in doc.ents:
        newtokens = [c for c in span if not c.is_space]
        if newtokens:
            newspan = doc[newtokens[0].i:(newtokens[-1].i + 1)]
            entities.add(newspan)
    if not match_ids:
        entities.update([doc[start:end] for _, start, end in matcher(doc)])
    else:
        for m_id, start, end in matcher(doc):
            if matcher.vocab.strings[m_id] in match_ids:
                entities.add(doc[start:end])
    if remove_overlap:
        entities = unoverlap(entities)
    return entities
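
# Illustrative end-to-end usage (a sketch, assuming `en_core_web_sm` is
# installed; the exact spans returned depend on the model):
#     nlp = load_spacy_model()
#     matcher = make_np_matcher(nlp)
#     doc = nlp("James Stewart was the top actor by rating in 1942.")
#     ents = ner(doc, matcher)
#     [e.text for e in ents]  # e.g. ['James Stewart', 'rating', '1942', ...]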


def sanitize_indices(shape, i, axis=0):
    n = shape[axis]
    if i <= n // 2:
        return i
    return -(n - i)
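
# Examples: indices in the upper half of an axis are rewritten as negative
# (from-the-end) indices, presumably so that references near the end of the
# data stay meaningful as the data grows.
#     sanitize_indices((10, 3), 2)          # -> 2
#     sanitize_indices((10, 3), 8)          # -> -2
#     sanitize_indices((10, 3), 2, axis=1)  # -> -1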


def sanitize_text(text, d_round=2):
    """All text cleaning and standardization logic goes here."""
    nums = re.findall(r"\d+\.\d+", text)
    for num in nums:
        # Escape the match so the decimal point is not treated as a regex
        # wildcard during substitution.
        text = re.sub(re.escape(num), str(round(float(num), d_round)), text)
    return text
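
# Example: sanitize_text("Growth was 3.14159 per cent")
#          -> 'Growth was 3.14 per cent'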


def sanitize_df(df, d_round=2, **options):
    """All dataframe cleaning and standardizing logic goes here."""
    for c in df.columns[df.dtypes == float]:
        df[c] = df[c].round(d_round)
    return df
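
# Example: float columns are rounded in place;
#     sanitize_df(pd.DataFrame({'x': [1.2345]}))['x'][0]  # -> 1.23
# (pandas is referenced as `pd` here only for illustration.)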


def sanitize_fh_args(args, func=join_words):
    for k, v in args.items():
        args[k] = [func(x) for x in v]
    return args
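
# Example: each value in the FormHandler-style arguments is reduced to plain
# words, e.g.
#     sanitize_fh_args({'_by': ['city, name']})  # -> {'_by': ['city name']}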


def add_html_styling(template, style):
    """Add HTML styling spans to template elements.

    Parameters
    ----------
    template : str
        A tornado template
    style : dict or bool
        If False, no styling is added.
        If True, a default bgcolor is added to template variables.
        If dict, expected to contain HTML span styling elements.

    Returns
    -------
    str
        Modified template with each variable stylized.

    Example
    -------
    >>> t = 'Hello, {{ name }}!'
    >>> add_html_styling(t, True)
    '<p>Hello, <span style="background-color:#c8f442">{{ name }}</span>!</p>'
    >>> add_html_styling(t, False)
    'Hello, {{ name }}!'
    >>> add_html_styling(t, {'background-color': '#ffffff', 'font-family': 'monospace'})
    '<p>Hello, <span style="background-color:#ffffff;font-family:monospace">{{ name }}</span>!</p>'
    """
    if not style:
        return template
    pattern = re.compile(r'\{\{[^\{\}]+\}\}')
    if isinstance(style, dict):
        # convert the style dict into a stylized HTML span
        spanstyle = ";".join(['{}:{}'.format(k, v) for k, v in style.items()])
    else:
        spanstyle = "background-color:#c8f442"
    for m in re.finditer(pattern, template):
        token = m.group()
        repl = '<span style="{ss}">{token}</span>'.format(
            ss=spanstyle, token=token)
        template = re.sub(re.escape(token), repl, template, 1)
    return '<p>{template}</p>'.format(template=template)


# @coroutine
# def check_grammar(text, lang='en-us'):
#     """Check `text` for grammatical errors.
#
#     Parameters
#     ----------
#     text : str
#         The text to check.
#     lang : str, optional
#         Language of text.
#     """
#     client = AsyncHTTPClient()
#     query = six.moves.urllib.parse.urlencode({'q': text, 'lang': lang})
#     resp = yield client.fetch("http://localhost:9988/admin/nlg/languagetool/?" + query)
#     raise Return(resp.body)