# vim:fenc=utf-8

"""
Miscellaneous utilities.
"""
import re

import six
from tornado.template import Template

from gramex.data import filter as grmfilter  # NOQA: F401
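
# Token patterns for the rule-based matcher built in `make_np_matcher` below:
# NP1 matches runs of proper nouns, NP2 runs of common nouns, NP3 adverbs
# followed by verbs, NP4 adjectives followed by verbs, and QUANT runs of
# numerals.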

NP_RULES = {
    "NP1": [{"POS": "PROPN", "OP": "+"}],
    "NP2": [{"POS": "NOUN", "OP": "+"}],
    "NP3": [{"POS": "ADV", "OP": "+"}, {"POS": "VERB", "OP": "+"}],
    "NP4": [{"POS": "ADJ", "OP": "+"}, {"POS": "VERB", "OP": "+"}],
    "QUANT": [{"POS": "NUM", "OP": "+"}]
}

NARRATIVE_TEMPLATE = """
{% autoescape None %}
from nlg import grammar as G
from nlg import nlgutils as U
from tornado.template import Template as T
import pandas as pd

df = None  # set your dataframe here.
narrative = T(\"\"\"
    {{ tmpl }}
    \"\"\").generate(
    tornado_tmpl=True, orgdf=df, fh_args={{ fh_args }},
    G=G, U=U)
print(narrative)
"""

_spacy = {
    'model': False,
    'lemmatizer': False,
    'matcher': False
}


def load_spacy_model():
    """Load the spacy model when required."""
    if not _spacy['model']:
        from spacy import load
        nlp = load("en_core_web_sm")
        _spacy['model'] = nlp
    else:
        nlp = _spacy['model']
    return nlp


def get_lemmatizer():
    if not _spacy['lemmatizer']:
        from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
        from spacy.lemmatizer import Lemmatizer
        lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
        _spacy['lemmatizer'] = lemmatizer
    else:
        lemmatizer = _spacy['lemmatizer']
    return lemmatizer


def make_np_matcher(nlp, rules=NP_RULES):
    """Make a rule based noun phrase matcher.

    Parameters
    ----------
    nlp : `spacy.lang`
        The spacy model to use.
    rules : dict, optional
        Mapping of rule IDs to spacy attribute patterns, such that each mapping
        defines a noun phrase structure.

    Returns
    -------
    `spacy.matcher.Matcher`
    """
    if not _spacy['matcher']:
        from spacy.matcher import Matcher
        matcher = Matcher(nlp.vocab)
        for k, v in rules.items():
            matcher.add(k, None, v)
        _spacy['matcher'] = matcher
    else:
        matcher = _spacy['matcher']
    return matcher
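
# A minimal usage sketch (illustrative, not taken from this module; assumes
# spacy and the `en_core_web_sm` model are installed):
#
#   nlp = load_spacy_model()
#   matcher = make_np_matcher(nlp)
#   doc = nlp('The quick brown fox jumps over two lazy dogs.')
#   matcher(doc)  # -> list of (match_id, start, end) tuples over `doc`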


def render_search_result(text, results, **kwargs):
    for token, tokenlist in results.items():
        tmpl = [t for t in tokenlist if t.get('enabled', False)][0]
        text = text.replace(token, '{{{{ {} }}}}'.format(tmpl['tmpl']))
    return Template(text).generate(**kwargs).decode('utf-8')
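
# Hedged sketch of the expected inputs (the names below are illustrative):
# `results` maps a literal token in `text` to candidate template dicts, and
# the first dict marked `enabled` is substituted as a tornado expression,
# rendered with the keyword arguments, e.g.
#
#   render_search_result(
#       'Sales grew in London.',
#       {'London': [{'tmpl': 'df["city"].iloc[0]', 'enabled': True}]},
#       df=df)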


def join_words(x, sep=' '):
    return sep.join(re.findall(r'\w+', x, re.IGNORECASE))
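
# For example (behaviour follows directly from the regex above):
#   join_words('Hello, world!')      -> 'Hello world'
#   join_words('net-profit margin')  -> 'net profit margin'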


class set_nlg_gramopt(object):  # noqa: class to be used as a decorator
    """Decorator for adding callables to grammar options of the webapp.
    """
    def __init__(self, **kwargs):
        self.kwargs = kwargs

    def __call__(self, func):
        func.gramopt = True
        for k, v in self.kwargs.items():
            if not getattr(func, k, False):
                setattr(func, k, v)
        return func
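
# Illustrative usage (hedged; the attribute names below are made up, not taken
# from this module):
#
#   @set_nlg_gramopt(source='str', fe_name='Lowercase')
#   def lowercase(word):
#       return word.lower()
#
# The decorated callable gets `gramopt = True` plus any keyword attributes it
# does not already define, which the webapp reads to build its options menu.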


def is_overlap(x, y):
    """Whether the token x is contained within any span in the sequence y."""
    if "NUM" in [c.pos_ for c in x]:
        return False
    return any([x.text in yy for yy in y])


def unoverlap(tokens):
    """From a set of tokens, remove all tokens that are contained within
    others."""
    textmap = {c.text: c for c in tokens}
    text_tokens = six.viewkeys(textmap)
    newtokens = []
    for token in text_tokens:
        if not is_overlap(textmap[token], text_tokens - {token}):
            newtokens.append(token)
    return [textmap[t] for t in newtokens]
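
# For instance, if `tokens` holds spans whose texts are 'United States' and
# 'States', the latter is dropped because its text is a substring of the
# former; spans containing a NUM token are always kept (see `is_overlap`).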


def ner(doc, matcher, match_ids=False, remove_overlap=True):
    """Find all NEs and other nouns in a spacy doc.

    Parameters
    ----------
    doc: spacy.tokens.doc.Doc
        The document in which to search for entities.
    matcher: spacy.matcher.Matcher
        The rule based matcher to use for finding noun phrases.
    match_ids: list, optional
        IDs from the spacy matcher to filter from the matches.
    remove_overlap: bool, optional
        Whether to remove overlapping tokens from the result.

    Returns
    -------
    list
        List of spacy.tokens.span.Span objects.
    """
    entities = set()
    for span in doc.ents:
        newtokens = [c for c in span if not c.is_space]
        if newtokens:
            newspan = doc[newtokens[0].i: (newtokens[-1].i + 1)]
            entities.add(newspan)
    if not match_ids:
        entities.update([doc[start:end] for _, start, end in matcher(doc)])
    else:
        for m_id, start, end in matcher(doc):
            if matcher.vocab.strings[m_id] in match_ids:
                entities.add(doc[start:end])
    if remove_overlap:
        entities = unoverlap(entities)
    return entities
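
# A minimal usage sketch (illustrative; assumes spacy and `en_core_web_sm`
# are installed):
#
#   nlp = load_spacy_model()
#   matcher = make_np_matcher(nlp)
#   doc = nlp('James visited London twice in 2019.')
#   ner(doc, matcher)  # -> spans such as 'James', 'London', '2019'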


def sanitize_indices(shape, i, axis=0):
    n = shape[axis]
    if i <= n // 2:
        return i
    return -(n - i)
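
# Indices in the latter half of an axis are converted to their negative
# equivalents, e.g.:
#   sanitize_indices((10, 3), 3)  -> 3
#   sanitize_indices((10, 3), 7)  -> -3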


def sanitize_text(text, d_round=2):
    """All text cleaning and standardization logic goes here."""
    nums = re.findall(r"\d+\.\d+", text)
    for num in nums:
        text = re.sub(num, str(round(float(num), d_round)), text)
    return text
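
# For example, decimals embedded in the text are rounded in place:
#   sanitize_text('grew by 3.14159 per cent')  -> 'grew by 3.14 per cent'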


def sanitize_df(df, d_round=2, **options):
    """All dataframe cleaning and standardizing logic goes here."""
    for c in df.columns[df.dtypes == float]:
        df[c] = df[c].round(d_round)
    return df


def sanitize_fh_args(args, func=join_words):
    for k, v in args.items():
        args[k] = [func(x) for x in v]
    return args
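
# For example, filter-handler arguments such as {'_by': ['city, state']} become
# {'_by': ['city state']}: each value is reduced to its word characters by the
# default `join_words`.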


def add_html_styling(template, style):
    """Add HTML styling spans to template elements.

    Parameters
    ----------
    template : str
        A tornado template
    style : dict or bool
        If False, no styling is added.
        If True, a default bgcolor is added to template variables.
        If dict, expected to contain HTML span styling elements.

    Returns
    -------
    str
        Modified template with each variable styled.

    Example
    -------
    >>> t = 'Hello, {{ name }}!'
    >>> add_html_styling(t, True)
    '<p>Hello, <span style="background-color:#c8f442">{{ name }}</span>!</p>'
    >>> add_html_styling(t, False)
    'Hello, {{ name }}!'
    >>> add_html_styling(t, {'background-color': '#ffffff', 'font-family': 'monospace'})
    '<p>Hello, <span style="background-color:#ffffff;font-family:monospace">{{ name }}</span>!</p>'
    """

    if not style:
        return template
    pattern = re.compile(r'\{\{[^\{\}]+\}\}')
    if isinstance(style, dict):
        # convert the style dict into a stylized HTML span
        spanstyle = ";".join(['{}:{}'.format(k, v) for k, v in style.items()])
    else:
        spanstyle = "background-color:#c8f442"
    for m in re.finditer(pattern, template):
        token = m.group()
        repl = '<span style="{ss}">{token}</span>'.format(
            ss=spanstyle, token=token)
        template = re.sub(re.escape(token), repl, template, 1)
    return '<p>{template}</p>'.format(template=template)


# @coroutine
# def check_grammar(text, lang='en-us'):
#     """Check `text` for grammatical errors.

#     Parameters
#     ----------
#     text : str
#         The text to check.
#     lang : str, optional
#         Language of text.
#     """
#     client = AsyncHTTPClient()
#     query = six.moves.urllib.parse.urlencode({'q': text, 'lang': lang})
#     resp = yield client.fetch("http://localhost:9988/admin/nlg/languagetool/?" + query)
#     raise Return(resp.body)