Coverage for gramex/transforms/twitterstream.py : 13%

import os
import six
import json
import time
import datetime
import tornado.httpclient
import gramex.data
from oauthlib import oauth1
from tornado.ioloop import IOLoop, PeriodicCallback
from tornado.httputil import HTTPHeaders, parse_response_start_line
from six.moves.urllib_parse import urlencode
from gramex.config import app_log
from gramex.transforms import flattener, build_transform
from gramex.http import (RATE_LIMITED, TOO_MANY_REQUESTS, CLIENT_TIMEOUT,
                         INTERNAL_SERVER_ERROR, GATEWAY_TIMEOUT)


class TwitterStream(object):
    '''
    Starts a Twitter Streaming client. Sample usage::

        >>> from gramex.transforms import TwitterStream
        >>> stream = TwitterStream(
        ...     track='modi,mms',
        ...     path='save-as-file.json',
        ...     key='...',
        ...     secret='...',
        ...     access_key='...',
        ...     access_secret='...',
        ...     flush=True)

    This saves all tweets mentioning ``modi`` or ``mms`` in ``save-as-file.json``,
    with each line representing a tweet in JSON format.

    If ``flush=True``, the file is flushed on every tweet. If ``flush=<number>``,
    the file is flushed every ``<number>`` seconds. If ``flush=False`` (default),
    the file is flushed only when the file or app is closed.

    This function runs forever, so run it in a separate thread.
    '''
    def __init__(self, **kwargs):
        self.params = kwargs
        self.url = 'https://stream.twitter.com/1.1/statuses/filter.json'
        self.valid_params = {
            'follow', 'track', 'locations', 'delimited', 'stall_warnings',
            'filter_level', 'language'}
        self.enabled = True
        self.delay = 0

        # Set up writers
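        # Tweets may be written to (any one of):
        #   path: a file, one JSON line per tweet (via StreamWriter below)
        #   function: an arbitrary handler compiled by build_transform
        #   driver=sqlalchemy: a database table, with selected fields flattened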
        if 'path' in kwargs:
            self.stream = StreamWriter(kwargs['path'], flush=kwargs.get('flush', False))
            self.process_bytes = self.stream.write
        elif 'function' in kwargs:
            self.process_json = build_transform(
                kwargs, vars={'message': {}}, filename='TwitterStream:function')
        elif kwargs.get('driver') == 'sqlalchemy':
            engine = gramex.data.create_engine(kwargs['url'], **kwargs.get('parameters', {}))
            table = gramex.data.get_table(engine, kwargs['table'])
            fields = kwargs['fields']
            for field in list(fields.keys()):
                if field not in table.columns:
                    app_log.error('TwitterStream field %s not in table' % field)
                    fields.pop(field)
            flatten = flattener(fields=fields)
            self.process_json = lambda tweet: engine.execute(table.insert(flatten(tweet)))

        self.buf = bytearray()
        self.client = tornado.httpclient.HTTPClient()
        while True:
            # Set .enabled to False to temporarily disable streamer
            if self.enabled:
                params = {key: val.encode('utf-8') for key, val in self.params.items()
                          if key in self.valid_params}
                if 'follow' not in params and 'track' not in params and 'locations' not in params:
                    self.enabled = False
                    self.delay = 5
                    app_log.error('TwitterStream needs follow, track or locations. Disabling')
                else:
                    self.fetch_tweets(params)
            # Restart after a delay determined by self.delay
            time.sleep(self.delay)

    def fetch_tweets(self, tweet_params):
        oauth = oauth1.Client(
            client_key=self.params['key'],
            client_secret=self.params['secret'],
            resource_owner_key=self.params['access_key'],
            resource_owner_secret=self.params['access_secret'])
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Gramex',
        }
        url, headers, data = oauth.sign(
            self.url, 'POST', body=urlencode(tweet_params), headers=headers)
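        # Tornado calls self._stream with each chunk of the response body as it
        # arrives, and self.header_callback with each response header line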
        self.req = tornado.httpclient.HTTPRequest(
            method='POST', url=url, body=data, headers=headers,
            request_timeout=864000,             # Keep request alive for 10 days
            streaming_callback=self._stream,
            header_callback=self.header_callback)

        try:
            self.headers = None
            self.client.fetch(self.req)
            self.delay = 0
        except tornado.httpclient.HTTPError as e:
            # HTTPError is raised for non-200 HTTP status codes.
            # For rate limiting, start with 1 minute and double each attempt
            if e.code in {RATE_LIMITED, TOO_MANY_REQUESTS}:
                self.delay = self.delay * 2 if self.delay else 60
                app_log.error('TwitterStream HTTP %d (rate limited): %s. Retry: %ss',
                              e.code, e.response, self.delay)
            # For Tornado timeout errors, reconnect immediately
            elif e.code == CLIENT_TIMEOUT:
                self.delay = 0
                app_log.error('TwitterStream HTTP %d (timeout): %s. Retry: %ss',
                              e.code, e.response, self.delay)
            # For server errors, start with 5 seconds and double until 320 seconds
            elif INTERNAL_SERVER_ERROR <= e.code <= GATEWAY_TIMEOUT:
                self.delay = min(320, self.delay * 2 if self.delay else 5)      # noqa: 320 seconds
                app_log.error('TwitterStream HTTP %d: %s. Retry: %ss',
                              e.code, e.response, self.delay)
            # For client errors (e.g. wrong params), disable connection
            else:
                self.delay, self.enabled = 5, False
                app_log.error('TwitterStream HTTP %d: %s. Disabling', e.code, e.response)
        except Exception as e:
            # Other errors are possible, such as IOError.
            # Increase the delay in reconnects by 250ms each attempt, up to 16 seconds.
            self.delay = min(16, self.delay + 0.25)     # noqa: 16 seconds, 0.25 seconds
            app_log.error('TwitterStream exception %s. Retry: %ss', e, self.delay)

    def header_callback(self, line):
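        # Tornado calls this once per response header line. The first line is
        # the HTTP status line; subsequent lines are individual headers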
        try:
            if self.headers is None:
                start_line = parse_response_start_line(line)
                self.http_version, self.status_code, self.http_reason = start_line
                self.headers = HTTPHeaders()
            else:
                self.headers.parse_line(line)
        except Exception:
            app_log.exception('Cannot parse header %s' % line)

    def _stream(self, data):
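        # The streaming API delimits messages with \r\n and sends blank
        # keep-alive lines. Buffer chunks until at least one full line arrives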
        buf = self.buf
        buf.extend(data)
        while len(buf):
            index = buf.find(b'\r\n')
            if index < 0:
                break
            data = bytes(buf[:index])
            del buf[:index + 2]
            # Ignore stall warnings
            if len(data) == 0:
                continue
            try:
                self.process_bytes(data)
            except Exception:
                app_log.exception('TwitterStream could not process: %s' % data)

    def process_bytes(self, data):
        try:
            text = six.text_type(data, encoding='utf-8')
            message = json.loads(text)
        except UnicodeError:
            app_log.error('TwitterStream unicode error: %s', data)
            return
        except ValueError:
            # When rate limited, text="Exceeded connection limit for user"
            app_log.error('TwitterStream non-JSON data: %s', text)
            return
        # Process the message (which is usually, but not always, a tweet)
        try:
            self.process_json(message)
        except Exception:
            app_log.exception('TwitterStream could not process message: %s' % text)

    def process_json(self, message):
        '''Subclass this to process tweets differently'''
        app_log.info(repr(message))


class StreamWriter(object):
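    '''
    Writes byte strings, one per line, into a file stream. The stream is
    rotated based on the current time (see :meth:`rotate`) and flushed on
    every write (``flush=True``), every few seconds (``flush=<number>``),
    or only when the file or app is closed (``flush=False``, the default).
    '''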
    def __init__(self, path, flush=False):
        self.path = path
        self.stream = self.stream_path = self.flush_on_write = None
        if isinstance(flush, bool):
            self.flush_on_write = flush
        elif isinstance(flush, (int, float)):
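            # PeriodicCallback takes milliseconds: flush every `flush` seconds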
            self.flush_loop = PeriodicCallback(self.flush, flush * 1000)
            self.flush_loop.start()
        else:
            raise ValueError('flush=%r is not int/bool' % flush)
        self.rotate()

    def flush(self):
        if self.stream is not None:
            self.stream.flush()

    def rotate(self):
        '''
        Create and rotate file streams.

        The ``path`` format string determines the filename. For example,
        ``tweets.{:%Y-%m-%d}.jsonl`` creates a filename based on the current
        date, e.g. ``tweets.2016-12-31.jsonl``. When rotating, if the new
        filename is the same as the old, the file continues. If it's a different
        file, the old file is closed and the new file is created.

        The rotation frequency is based on the crontab entries in the config,
        i.e. based on ``hours``, ``days``, ``weeks``, etc. It defaults to every
        minute.
        '''
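        # The stream actually rotates only when the formatted path changes;
        # the check itself re-runs every minute (scheduled at the end below)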
        # First, flush the stream to ensure that data is not lost.
        # Then set up new stream (if required, based on the filename)
        self.flush()
        path = self.path.format(datetime.datetime.utcnow())
        if path != self.stream_path:
            if self.stream is not None:
                self.stream.close()
            self.stream_path = path
            folder = os.path.dirname(os.path.abspath(path))
            if not os.path.exists(folder):
                os.makedirs(folder)
            self.stream = open(path, 'ab')
            app_log.debug('StreamWriter writing to %s', path)

        # Schedule the next call after a minute
        IOLoop.current().call_later(60, self.rotate)

    def write(self, data):
        self.stream.write(data)
        self.stream.write(b'\n')    # the stream is binary: write bytes, not str
        if self.flush_on_write:
            self.stream.flush()
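

if __name__ == '__main__':
    # A minimal usage sketch, not part of the original module. The
    # TwitterStream constructor loops forever, so (as its docstring advises)
    # run it in a separate thread. All credential values are placeholders.
    import threading
    thread = threading.Thread(target=TwitterStream, kwargs=dict(
        track='modi,mms',
        path='tweets.{:%Y-%m-%d}.jsonl',
        flush=True,
        key='...', secret='...',
        access_key='...', access_secret='...'))
    thread.daemon = True
    thread.start()
    thread.join()   # keep the main thread alive while streaming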