# NOTE: web-page residue, not part of the module: "You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long."
# File: bitter/bitter/crawlers.py (148 lines, 4.5 KiB, Python)

import time
import urllib
import random
import json
import logging
logger = logging.getLogger(__name__)
from twitter import *
from collections import OrderedDict
from threading import Lock
from . import utils
from . import config
class AttrToFunc(object):
    """Turn chained attribute access into a single handler call.

    Every attribute lookup yields a new ``AttrToFunc`` whose accumulated
    list of path segments has the attribute name appended. Calling the
    object passes that list, together with the call's own arguments, to
    ``handler``.
    """

    def __init__(self, uriparts=None, handler=None):
        """Remember the segments collected so far and the handler to call.

        uriparts -- list of segments; any falsy value (``None``, ``[]``)
                    is replaced by a new empty list.
        handler  -- callable invoked as ``handler(uriparts, *args, **kwargs)``
                    when this object is called.
        """
        self.__uriparts = uriparts if uriparts else []
        self.handler = handler

    def __getattr__(self, k):
        """Return a child object with ``k`` appended to the segment list.

        The special name ``_`` returns a one-argument function instead,
        so that an arbitrary value can be appended as a segment.
        """
        def append_segment(segment):
            # Build a new list so the parent's segments are left untouched.
            return AttrToFunc(
                handler=self.handler,
                uriparts=self.__uriparts + [segment],
            )

        return append_segment if k == "_" else append_segment(k)

    def __call__(self, *args, **kwargs):
        """Invoke the handler with the collected segments and the arguments."""
        return self.handler(self.__uriparts, *args, **kwargs)
class TwitterWorker(object):
    """A named API client together with its throttling bookkeeping.

    Attributes set by ``__init__``:
      name           -- label used in log messages.
      client         -- the client object supplied by the caller.
      throttled_time -- epoch second until which the worker is throttled,
                        or ``False`` if it has never been throttled.
      busy           -- flag, initialised to ``False``.
      _lock          -- a ``threading.Lock``; created here, not otherwise
                        used inside this class.
    """

    # Back-off applied by throttle_until() when no reset time is given.
    DEFAULT_THROTTLE_SECONDS = 30

    def __init__(self, name, client):
        self.name = name
        self.client = client
        self.throttled_time = False
        self._lock = Lock()
        self.busy = False

    @property
    def throttled(self):
        """Return True while ``throttled_time`` lies in the future."""
        if not self.throttled_time:
            return False
        return self.throttled_time - time.time() > 0

    def throttle_until(self, epoch=None):
        """Mark the worker as throttled until ``epoch``.

        epoch -- epoch timestamp as an int, a float or a numeric string
                 (HTTP header values arrive as strings). ``None`` means
                 ``DEFAULT_THROTTLE_SECONDS`` from now.

        Raises ValueError/TypeError if ``epoch`` cannot be read as a number.
        """
        now = time.time()
        if epoch is None:
            epoch = now + self.DEFAULT_THROTTLE_SECONDS
        # Normalise first: subtracting a float from a raw header string
        # would raise TypeError after the state had already been updated.
        self.throttled_time = int(float(epoch))
        logger.info("Worker %s throttled for %s seconds",
                    self.name, self.throttled_time - now)
class TwitterQueue(AttrToFunc):
    """Dispatch API calls across a pool of ``TwitterWorker`` objects.

    Attribute chains on the queue (``queue.statuses.show(...)``) are
    collected by ``AttrToFunc`` and end up in ``handle_call``, which picks
    a worker that is neither busy nor throttled and issues the request
    through that worker's client.
    """

    def __init__(self, wait=True):
        """Create an empty pool.

        wait -- when true, ``next()`` sleeps and retries until a worker is
                available; when false it raises immediately.
        """
        logger.debug('Creating worker queue')
        self.queue = set()
        self.index = 0
        self.wait = wait
        AttrToFunc.__init__(self, handler=self.handle_call)

    def ready(self, worker):
        """Add ``worker`` to the pool."""
        self.queue.add(worker)

    def handle_call(self, uriparts, *args, **kwargs):
        """Issue the call named by ``uriparts`` through an available worker.

        Retries with another worker pick after HTTP 429/502/503/504 (the
        failing worker is throttled first) and after ``URLError`` (following
        a 5 second pause). Any other ``TwitterHTTPError`` is re-raised.

        Returns whatever the worker's client returns for the call.
        """
        # ``import urllib`` alone does not guarantee the ``error`` submodule
        # is loaded; import it explicitly before naming it in ``except``.
        import urllib.error

        logger.debug('Called: {}'.format(uriparts))
        logger.debug('With: {} {}'.format(args, kwargs))
        while True:
            worker = self.next()
            with worker._lock:
                worker.busy = True
                try:
                    logger.debug('Next: {}'.format(worker.name))
                    ping = time.time()
                    resp = getattr(worker.client, "/".join(uriparts))(*args, **kwargs)
                    pong = time.time()
                    logger.debug('Took: {}'.format(pong - ping))
                    return resp
                except TwitterHTTPError as ex:
                    if ex.e.code not in (429, 502, 503, 504):
                        raise
                    # The header value is a string when present; convert it
                    # so arithmetic on the reset time cannot fail. Missing
                    # or malformed values fall back to 30 seconds from now.
                    try:
                        limit = float(ex.e.headers.get('X-Rate-Limit-Reset'))
                    except (TypeError, ValueError):
                        limit = time.time() + 30
                    logger.info('{} limited'.format(worker.name))
                    worker.throttle_until(limit)
                    continue
                except urllib.error.URLError as ex:
                    logger.info('Something fishy happened: {}'.format(ex))
                finally:
                    worker.busy = False
            # Only the URLError path falls through to here. Pause after the
            # worker has been released so it is not held busy while idle.
            time.sleep(5)

    @property
    def client(self):
        """Client of the next available worker (see ``next()``)."""
        return self.next().client

    @classmethod
    def from_credentials(cls, cred_file=None):
        """Build a queue with one worker per entry of ``cred_file``.

        Each credential is read via ``utils.get_credentials`` and must
        provide ``token_key``, ``token_secret``, ``consumer_key``,
        ``consumer_secret`` and ``user``.
        """
        wq = cls()
        for cred in utils.get_credentials(cred_file):
            c = Twitter(auth=OAuth(cred['token_key'],
                                   cred['token_secret'],
                                   cred['consumer_key'],
                                   cred['consumer_secret']))
            wq.ready(TwitterWorker(cred["user"], c))
        return wq

    def _next(self):
        """Return a random worker that is neither throttled nor busy.

        Raises Exception('No worker is available') when there is none.
        """
        logger.debug('Getting next available')
        candidates = list(self.queue)
        random.shuffle(candidates)
        for worker in candidates:
            if not worker.throttled and not worker.busy:
                return worker
        raise Exception('No worker is available')

    def next(self):
        """Return an available worker, waiting for one if ``wait`` is set.

        With ``wait`` false this is ``_next()`` and raises when no worker
        is available. Otherwise it sleeps until the earliest throttle
        expires (or 5 seconds when every worker is busy) and tries again.
        """
        if not self.wait:
            return self._next()
        while True:
            try:
                return self._next()
            except Exception:
                # _next() signals "nothing available" with a plain Exception.
                # Build a real list: a filter() object is always truthy, so
                # the "all busy" case could never be detected with it.
                idle = [w for w in self.queue if not w.busy]
                if idle:
                    first_worker = min(idle, key=lambda w: w.throttled_time)
                    diff = first_worker.throttled_time - time.time()
                    logger.info("All workers are throttled. Waiting %s seconds" % diff)
                else:
                    diff = 5
                    logger.info("All workers are busy. Waiting %s seconds" % diff)
                # The throttle may have expired since _next() ran, and
                # time.sleep() rejects negative values.
                time.sleep(max(diff, 0))