Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1import os 

2import six 

3import json 

4import time 

5import datetime 

6import tornado.httpclient 

7import gramex 

8from oauthlib import oauth1 

9from tornado.ioloop import IOLoop, PeriodicCallback 

10from tornado.httputil import HTTPHeaders, parse_response_start_line 

11from six.moves.urllib_parse import urlencode 

12from gramex.config import app_log 

13from gramex.transforms import flattener, build_transform 

14from gramex.http import (RATE_LIMITED, TOO_MANY_REQUESTS, CLIENT_TIMEOUT, 

15 INTERNAL_SERVER_ERROR, GATEWAY_TIMEOUT) 

16 

17 

class TwitterStream(object):
    '''
    Starts a Twitter Streaming client. Sample usage::

        >>> from gramex.transforms import TwitterStream
        >>> stream = TwitterStream(
        ...     track='modi,mms',
        ...     path='save-as-file.json',
        ...     key='...',
        ...     secret='...',
        ...     access_key='...',
        ...     access_secret='...',
        ...     flush=True)

    This saves all tweets mentioning ``modi`` or ``mms`` in ``save-as-file.json``
    with each line representing a tweet in JSON format.

    If ``flush=True``, the file is flushed on every tweet. If ``flush=<number>``,
    the file is flushed every ``<number>`` seconds. If ``flush=False`` (default),
    the file is flushed only when the file or app is closed.

    This function runs forever, so run it in a separate thread.
    '''
    def __init__(self, **kwargs):
        # kwargs carries both OAuth credentials (key, secret, access_key,
        # access_secret), output config (path/function/driver) and Twitter
        # streaming parameters (track, follow, ...)
        self.params = kwargs
        self.url = 'https://stream.twitter.com/1.1/statuses/filter.json'
        # Only these keys from kwargs are forwarded to the streaming API
        self.valid_params = {
            'follow', 'track', 'locations', 'delimited', 'stall_warnings',
            'filter_level', 'language'}
        self.enabled = True
        # Seconds to sleep between reconnect attempts; adjusted by fetch_tweets
        self.delay = 0

        # Set up writers. Exactly one of three sinks is configured:
        # a file (path), a custom function, or a SQLAlchemy table.
        if 'path' in kwargs:
            self.stream = StreamWriter(kwargs['path'], flush=kwargs.get('flush', False))
            self.process_bytes = self.stream.write
        elif 'function' in kwargs:
            self.process_json = build_transform(
                kwargs, vars={'message': {}}, filename='TwitterStream:function')
        elif kwargs.get('driver') == 'sqlalchemy':
            engine = gramex.data.create_engine(kwargs['url'], **kwargs.get('parameters', {}))
            table = gramex.data.get_table(kwargs['table'])
            fields = kwargs['fields']
            # Drop configured fields that don't exist as table columns.
            # Iterate over a copy of the keys since fields is mutated inside.
            for field in list(fields.keys()):
                if field not in table.columns:
                    app_log.error('TwitterStream field %s not in table' % field)
                    fields.pop(field)
            flatten = flattener(fields=fields)
            self.process_json = lambda tweet: engine.execute(table.insert(flatten(tweet)))

        # Buffer of raw bytes received but not yet split into messages
        self.buf = bytearray()
        # Synchronous client: .fetch() blocks until the stream disconnects
        self.client = tornado.httpclient.HTTPClient()
        while True:
            # Set .enabled to False to temporarily disable streamer
            if self.enabled:
                params = {key: val.encode('utf-8') for key, val in self.params.items()
                          if key in self.valid_params}
                if 'follow' not in params and 'track' not in params and 'locations' not in params:
                    self.enabled = False
                    self.delay = 5
                    app_log.error('TwitterStream needs follow, track or locations. Disabling')
                else:
                    self.fetch_tweets(params)
            # Restart after a delay determined by the last error (see fetch_tweets)
            time.sleep(self.delay)

    def fetch_tweets(self, tweet_params):
        '''
        Make one blocking, OAuth1-signed POST to the streaming endpoint and
        process the response as it streams in. On failure, update
        ``self.delay`` (and possibly ``self.enabled``) so the loop in
        ``__init__`` backs off appropriately before retrying.
        '''
        oauth = oauth1.Client(
            client_key=self.params['key'],
            client_secret=self.params['secret'],
            resource_owner_key=self.params['access_key'],
            resource_owner_secret=self.params['access_secret'])
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Gramex',
        }
        # oauth.sign returns url, headers and body with the OAuth1
        # Authorization signature applied
        url, headers, data = oauth.sign(
            self.url, 'POST', body=urlencode(tweet_params), headers=headers)
        self.req = tornado.httpclient.HTTPRequest(
            method='POST', url=url, body=data, headers=headers,
            request_timeout=864000,         # Keep request alive for 10 days
            streaming_callback=self._stream,
            header_callback=self.header_callback)

        try:
            # Reset so header_callback knows the next line is a status line
            self.headers = None
            self.client.fetch(self.req)
            self.delay = 0
        except tornado.httpclient.HTTPError as e:
            # HTTPError is raised for non-200 HTTP status codes.
            # For rate limiting, start with 1 minute and double each attempt
            if e.code in {RATE_LIMITED, TOO_MANY_REQUESTS}:
                self.delay = self.delay * 2 if self.delay else 60
                app_log.error('TwitterStream HTTP %d (rate limited): %s. Retry: %ss',
                              e.code, e.response, self.delay)
            # For Tornado timeout errors, reconnect immediately
            elif e.code == CLIENT_TIMEOUT:
                self.delay = 0
                app_log.error('TwitterStream HTTP %d (timeout): %s. Retry: %ss',
                              e.code, e.response, self.delay)
            # For server errors, start with 1 second and double until 320 seconds
            elif INTERNAL_SERVER_ERROR <= e.code <= GATEWAY_TIMEOUT:
                self.delay = min(320, self.delay * 2 if self.delay else 1)  # noqa: 320 seconds
                app_log.error('TwitterStream HTTP %d: %s. Retry: %ss',
                              e.code, e.response, self.delay)
            # For client errors (e.g. wrong params), disable connection
            else:
                self.delay, self.enabled = 5, False
                app_log.error('TwitterStream HTTP %d: %s. Disabling', e.code, e.response)
        except Exception as e:
            # Other errors are possible, such as IOError.
            # Increase the delay in reconnects by 250ms each attempt, up to 16 seconds.
            self.delay = min(16, self.delay + 0.25)     # noqa: 16 seconds, 0.25 seconds
            app_log.error('TwitterStream exception %s. Retry: %ss', e, self.delay)

    def header_callback(self, line):
        '''
        Parse response header lines as Tornado delivers them. The first line
        (when ``self.headers`` is None) is the HTTP status line; every
        subsequent line is parsed as an individual header.
        '''
        try:
            if self.headers is None:
                start_line = parse_response_start_line(line)
                self.http_version, self.status_code, self.http_reason = start_line
                self.headers = HTTPHeaders()
            else:
                self.headers.parse_line(line)
        except Exception:
            # Log and continue: a malformed header should not kill the stream
            app_log.exception('Cannot parse header %s' % line)

    def _stream(self, data):
        '''
        Streaming callback: accumulate raw chunks in ``self.buf`` and hand
        every complete CRLF-delimited message to ``process_bytes``.
        '''
        buf = self.buf
        buf.extend(data)
        while len(buf):
            index = buf.find(b'\r\n')
            if index < 0:
                # No complete message yet; wait for the next chunk
                break
            data = bytes(buf[:index])
            del buf[:index + 2]
            # Ignore stall warnings (blank keep-alive lines)
            if len(data) == 0:
                continue
            try:
                self.process_bytes(data)
            except Exception:
                app_log.exception('TwitterStream could not process: %s' % data)

    def process_bytes(self, data):
        '''
        Decode one raw message as UTF-8 JSON and pass the result to
        ``process_json``. Messages with invalid Unicode or non-JSON content
        are logged and dropped.
        '''
        try:
            text = six.text_type(data, encoding='utf-8')
            message = json.loads(text)
        except UnicodeError:
            app_log.error('TwitterStream unicode error: %s', data)
            return
        except ValueError:
            # When rate limited, text="Exceeded connection limit for user"
            app_log.error('TwitterStream non-JSON data: %s', text)
            return
        # Process the message (which is usually, but not always, a tweet)
        try:
            self.process_json(message)
        except Exception:
            app_log.exception('TwitterStream could not process message: %s' % text)

    def process_json(self, message):
        '''Subclass this to process tweets differently'''
        app_log.info(repr(message))

181 

182 

class StreamWriter(object):
    '''
    Writes byte records to a rotating file, one record per line.

    ``path`` is a format string evaluated against the current UTC datetime,
    e.g. ``tweets.{:%Y-%m-%d}.jsonl``. ``flush`` may be:

    - ``False`` (default): flush only when the stream is closed or rotated
    - ``True``: flush after every :meth:`write`
    - a number: flush every that-many seconds via a Tornado ``PeriodicCallback``

    Raises ``ValueError`` for any other ``flush`` value.
    '''
    def __init__(self, path, flush=False):
        self.path = path
        self.stream = self.stream_path = self.flush_on_write = None
        # Check bool BEFORE int/float: True/False are also instances of int,
        # so the order of these isinstance checks matters.
        if isinstance(flush, bool):
            self.flush_on_write = flush
        elif isinstance(flush, (int, float)):
            # PeriodicCallback takes the interval in milliseconds
            self.flush_loop = PeriodicCallback(self.flush, flush * 1000)
            self.flush_loop.start()
        else:
            raise ValueError('flush=%r is not int/bool' % flush)
        # Open the initial stream (and schedule future rotations)
        self.rotate()

    def flush(self):
        '''Flush the underlying file stream, if one is open.'''
        if self.stream is not None:
            self.stream.flush()

    def rotate(self):
        '''
        Create and rotate file streams.

        The ``path`` format string determines the filename. For example,
        ``tweets.{:%Y-%m-%d}.jsonl`` creates a filename based on the current
        date, e.g. ``tweets.2016-12-31.jsonl``. When rotating, if the new
        filename is the same as the old, the file continues. If it's a different
        file, the old file is closed and the new file is created.

        The rotation frequency is based on the crontab entries in the config,
        i.e. based on ``hours``, ``days``, ``weeks``, etc. It defaults to every
        minute.
        '''
        # First, flush the stream to ensure that data is not lost.
        # Then set up new stream (if required, based on the filename)
        self.flush()
        path = self.path.format(datetime.datetime.utcnow())
        if path != self.stream_path:
            if self.stream is not None:
                self.stream.close()
            self.stream_path = path
            # Ensure the target directory exists before opening the file
            folder = os.path.dirname(os.path.abspath(path))
            if not os.path.exists(folder):
                os.makedirs(folder)
            self.stream = open(path, 'ab')
            app_log.debug('StreamWriter writing to %s', path)

        # Schedule the next call after a minute
        IOLoop.current().call_later(60, self.rotate)

    def write(self, data):
        '''
        Write one ``bytes`` record followed by a newline, flushing if
        ``flush=True`` was configured.
        '''
        self.stream.write(data)
        # The stream is opened in binary mode ('ab'), so the newline must be
        # bytes: writing the str '\n' raises TypeError on Python 3.
        self.stream.write(b'\n')
        if self.flush_on_write:
            self.stream.flush()