mirror of https://github.com/balkian/bitter.git
synced 2024-12-22 08:28:12 +00:00

First commit
Commit d0de6c2ea9

.dockerignore (new file)

.*
env
*.egg-info
dist

.gitignore (new file, vendored)

*.egg-info
dist
env
__*
.*
*.pyc

Dockerfile (new file)

# onbuild copies . to /usr/src/app/
FROM python:2.7.9-onbuild
MAINTAINER J. Fernando Sánchez @balkian

# RUN pip --cert cacert.pem install -r -v requirements.txt

RUN pip install --editable .
ENTRYPOINT ["bitter"]

MANIFEST.in (new file)

include requirements.txt
include test-requirements.txt
include README.md

bitter/cli.py (new file)

import click
import json
import logging
import time
import threading

import sqlalchemy

from six.moves import map, filter, queue

from bitter import utils, crawlers
from bitter.models import make_session, User, ExtractorEntry, Following
# NOTE: ExitStack needs Python 3.3+; on the Python 2.7 image pinned in the
# Dockerfile, the contextlib2 backport provides the same class.
from contextlib import ExitStack

logger = logging.getLogger(__name__)


@click.group()
@click.option("--verbose", is_flag=True)
@click.option("--logging_level", required=False, default='WARN')
@click.option("--config", required=False)
@click.option('-c', '--credentials', show_default=True, default='credentials.json')
@click.pass_context
def main(ctx, verbose, logging_level, config, credentials):
    logging.basicConfig(level=getattr(logging, logging_level))
    ctx.obj = {}
    ctx.obj['VERBOSE'] = verbose
    ctx.obj['CONFIG'] = config
    ctx.obj['CREDENTIALS'] = credentials


@main.group()
@click.pass_context
def tweet(ctx):
    pass


@tweet.command('get')
@click.argument('tweetid')
@click.pass_context
def get_tweet(ctx, tweetid):
    wq = crawlers.TwitterQueue.from_credentials(ctx.obj['CREDENTIALS'])
    c = wq.next()
    t = utils.get_tweet(c.client, tweetid)  # get_tweet lives in utils, not crawlers
    print(json.dumps(t, indent=2))


@tweet.command('search')
@click.argument('query')
@click.pass_context
def search_tweets(ctx, query):  # renamed: a second get_tweet would shadow the command above
    wq = crawlers.TwitterQueue.from_credentials(ctx.obj['CREDENTIALS'])
    c = wq.next()
    t = utils.search_tweet(c.client, query)
    print(json.dumps(t, indent=2))


@main.group()
@click.pass_context
def users(ctx):
    pass


@users.command('list')
@click.option('--db', required=True, help='Database of users.')
@click.pass_context
def list_users(ctx, db):
    dburl = 'sqlite:///{}'.format(db)
    session = make_session(dburl)
    for i in session.query(User):
        print(i.screen_name)
        for j in i.__dict__:
            print('\t{}: {}'.format(j, getattr(i, j)))


@users.command('get_one')
@click.argument('user')
@click.pass_context
def get_user(ctx, user):
    wq = crawlers.TwitterQueue.from_credentials(ctx.obj['CREDENTIALS'])
    c = wq.next()
    u = utils.get_user(c.client, user)
    print(json.dumps(u, indent=2))


@users.command('get')
@click.option('--db', required=True, help='Database to save all users.')
@click.option('--skip', required=False, default=0, help='Skip N lines from the file.')
@click.option('--until', required=False, type=str, default=None, help='Skip all lines until ID.')
@click.option('--threads', required=False, type=int, default=20, help='Number of crawling threads.')
@click.argument('usersfile')  # file with a list of users to look up (click arguments take no help text)
@click.pass_context
def get_users(ctx, usersfile, skip, until, threads, db):
    global dburl, ids_queue, skipped, enqueued, collected, lastid, db_lock

    if '://' not in db:
        dburl = 'sqlite:///{}'.format(db)
        db_lock = threading.Lock()
    else:
        dburl = db
        # no file-level locking needed for a DB server; use a no-op context manager
        db_lock = ExitStack()

    wq = crawlers.TwitterQueue.from_credentials(ctx.obj['CREDENTIALS'])
    logger.info('Starting Network crawler with {} threads and {} credentials.'.format(threads,
                                                                                      len(wq.queue)))

    ids_queue = queue.Queue(1000)
    skipped = skip
    enqueued = 0
    collected = 0
    statslock = threading.Lock()
    lastid = -1

    def fill_queue():
        global enqueued, skipped
        with open(usersfile, 'r') as f:
            engine = sqlalchemy.create_engine(dburl)

            def user_filter(x):
                global skipped
                # keep only ids that are not already in the database
                keep = not list(engine.execute(
                    sqlalchemy.text('SELECT 1 FROM users WHERE id = :id'), id=x))
                if not keep:
                    skipped += 1
                return keep

            for i in range(skip):
                next(f)
            ilist = map(lambda x: x.strip(), f)
            logger.info('Skipping until {}'.format(until))
            if not skip and until:
                for uid in ilist:
                    if uid == until:
                        break
                    else:
                        skipped += 1
            ilist = filter(user_filter, ilist)
            for uid in ilist:
                ids_queue.put(uid)
                enqueued += 1
        for i in range(threads):
            ids_queue.put(None)  # one sentinel per consumer thread

    def consume_queue():
        global collected, lastid
        local_collected = 0
        logger.debug('Consuming!')
        session = make_session(dburl)
        q_iter = iter(ids_queue.get, None)  # stops at the sentinel
        for user in utils.get_users(wq, q_iter):
            # entities are already serialized to JSON by utils.trim_user
            dbuser = User(**user)
            session.add(dbuser)
            local_collected += 1
            with statslock:
                collected += 1
                lastid = user['id']
            if local_collected % 100 == 0:
                with db_lock:
                    session.commit()
        session.commit()
        logger.debug('Done consuming')

    filler = threading.Thread(target=fill_queue)
    filler.start()
    consumers = [threading.Thread(target=consume_queue) for i in range(threads)]
    logger.debug('Starting consumers')
    for c in consumers:
        c.start()
    logger.debug('Joining filler')
    counter = 0
    speed = 0
    lastcollected = collected
    while True:
        filler.join(1)
        logger.info('########\n'
                    ' Collected: {}\n'
                    ' Speed: ~ {} profiles/s\n'
                    ' Skipped: {}\n'
                    ' Enqueued: {}\n'
                    ' Queue size: {}\n'
                    ' Last ID: {}'.format(collected, speed, skipped, enqueued,
                                          ids_queue.qsize(), lastid))
        if not filler.is_alive():  # is_alive() works on both Python 2.6+ and 3
            if all(not i.is_alive() for i in consumers):
                break
        else:
            time.sleep(1)
        counter += 1
        if counter % 10 == 0:
            speed = (collected - lastcollected) / 10
            with statslock:
                lastcollected = collected

    logger.info('Done!')


@main.group('api')
def api():
    pass


@main.group('extractor')
@click.pass_context
@click.option('--db', required=True, help='Database of users.')
def extractor(ctx, db):
    if '://' not in db:
        db = 'sqlite:///{}'.format(db)
    ctx.obj['DBURI'] = db
    ctx.obj['SESSION'] = make_session(db)


@extractor.command('status')
@click.option('--with_followers', is_flag=True, default=False)
@click.option('--with_not_pending', is_flag=True, default=False)
@click.pass_context
def status_extractor(ctx, with_followers, with_not_pending):
    session = ctx.obj['SESSION']
    entries = session.query(ExtractorEntry)
    if not with_not_pending:
        entries = entries.filter(ExtractorEntry.pending == True)
    for i in entries:
        print(i.id)
        for j in i.__dict__:
            print('\t{}: {}'.format(j, getattr(i, j)))
    followers = session.query(Following)
    print('Followers count: {}'.format(followers.count()))
    if with_followers:
        for i in followers:
            print(i.id)
            for j in i.__dict__:
                print('\t{}: {}'.format(j, getattr(i, j)))


@extractor.command()
@click.option('--recursive', is_flag=True, help='Get following/follower/info recursively.', default=False)
@click.option('-u', '--user', default=None)
@click.option('-n', '--name', show_default=True, default='extractor')
@click.option('-i', '--initfile', required=False, default=None, help='List of users to load')
@click.pass_context
def extract(ctx, recursive, user, name, initfile):
    wq = crawlers.TwitterQueue.from_credentials(ctx.obj['CREDENTIALS'])
    dburi = ctx.obj['DBURI']
    utils.extract(wq,
                  recursive=recursive,
                  user=user,
                  dburi=dburi,
                  initfile=initfile,
                  extractor_name=name)


@extractor.command('reset')
@click.pass_context
def reset_extractor(ctx):
    db = ctx.obj['DBURI']
    session = make_session(db)
    session.query(ExtractorEntry).filter(ExtractorEntry.pending == True).update({'pending': False})
    session.commit()  # persist the bulk update


@api.command('limits')
@click.argument('url', required=False)
@click.pass_context
def get_limits(ctx, url):
    wq = crawlers.TwitterQueue.from_credentials(ctx.obj['CREDENTIALS'])
    for worker in wq.queue:
        resp = worker.client.application.rate_limit_status()
        print('#' * 20)
        print(worker.name)
        if url:
            limit = 'NOT FOUND'
            print('URL is: {}'.format(url))
            cat = url.split('/')[1]
            if cat in resp['resources']:
                limit = resp['resources'][cat].get(url, None) or resp['resources'][cat]
            else:
                print('Cat {} not found'.format(cat))
            print('{}: {}'.format(url, limit))
        else:
            print(json.dumps(resp, indent=2))


if __name__ == '__main__':
    main()
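
The `get_users` command above coordinates one filler thread and N consumer threads
through a bounded queue, shutting consumers down with one None sentinel per thread.
A minimal standalone sketch of the same pattern (all names here are illustrative,
not part of bitter):

import threading
from six.moves import queue

ids_queue = queue.Queue(1000)  # bounded, so the filler cannot outrun the consumers
N = 4                          # number of consumers (illustrative)

def fill():
    for uid in range(10):
        ids_queue.put(uid)
    for _ in range(N):
        ids_queue.put(None)    # one sentinel per consumer

def consume():
    for uid in iter(ids_queue.get, None):  # stops when the sentinel arrives
        print('processing', uid)

filler = threading.Thread(target=fill)
consumers = [threading.Thread(target=consume) for _ in range(N)]
filler.start()
for c in consumers:
    c.start()
filler.join()
for c in consumers:
    c.join()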

bitter/crawlers.py (new file)

import time
import random
import json
import logging

# explicit imports instead of `from twitter import *`
from twitter import Twitter, OAuth, TwitterHTTPError
# urllib.error only exists on Python 3; six gives a version-agnostic path
from six.moves.urllib.error import URLError

logger = logging.getLogger(__name__)


class AttrToFunc(object):
    def __init__(self, uriparts=None, handler=None):
        if uriparts:
            self.__uriparts = uriparts
        else:
            self.__uriparts = []
        self.handler = handler

    def __getattr__(self, k):
        def extend_call(arg):
            return AttrToFunc(
                uriparts=self.__uriparts + [arg, ],
                handler=self.handler)
        if k == "_":
            return extend_call
        else:
            return extend_call(k)

    def __call__(self, *args, **kwargs):
        return self.handler(self.__uriparts, *args, **kwargs)


class TwitterWorker(object):
    def __init__(self, name, client):
        self.name = name
        self.client = client
        self.throttled_time = False
        self.busy = False

    @property
    def throttled(self):
        if not self.throttled_time:
            return False
        t = time.time()
        delta = self.throttled_time - t
        if delta > 0:
            return True
        return False

    def throttle_until(self, epoch=None):
        self.throttled_time = int(epoch)
        logger.info("Worker %s throttled for %s seconds" % (self.name, str(epoch - time.time())))


class TwitterQueue(AttrToFunc):
    def __init__(self, wait=True):
        logger.debug('Creating worker queue')
        self.queue = set()
        self.index = 0
        self.wait = wait
        AttrToFunc.__init__(self, handler=self.handle_call)

    def ready(self, worker):
        self.queue.add(worker)

    def handle_call(self, uriparts, *args, **kwargs):
        logger.debug('Called: {}'.format(uriparts))
        logger.debug('With: {} {}'.format(args, kwargs))
        while True:
            c = None
            try:
                c = self.next()
                c.busy = True
                logger.debug('Next: {}'.format(c.name))
                ping = time.time()
                resp = getattr(c.client, "/".join(uriparts))(*args, **kwargs)
                pong = time.time()
                logger.debug('Took: {}'.format(pong - ping))
                return resp
            except TwitterHTTPError as ex:
                if ex.e.code in (429, 502, 503, 504):
                    limit = ex.e.headers.get('X-Rate-Limit-Reset', time.time() + 30)
                    logger.info('{} limited'.format(c.name))
                    c.throttle_until(limit)
                    continue
                else:
                    raise
            except URLError as ex:
                time.sleep(5)
                logger.info('Something fishy happened: {}'.format(ex))
            finally:
                if c:
                    c.busy = False

    @property
    def client(self):
        return self.next().client

    @classmethod
    def from_credentials(cls, cred_file):
        wq = cls()

        with open(cred_file) as f:
            for line in f:
                cred = json.loads(line)
                c = Twitter(auth=OAuth(cred['token_key'],
                                       cred['token_secret'],
                                       cred['consumer_key'],
                                       cred['consumer_secret']))
                wq.ready(TwitterWorker(cred["user"], c))
        return wq

    def _next(self):
        logger.debug('Getting next available')
        s = list(self.queue)
        random.shuffle(s)
        for worker in s:
            if not worker.throttled and not worker.busy:
                return worker
        raise Exception('No worker is available')

    def next(self):
        if not self.wait:
            return self._next()
        while True:
            try:
                return self._next()
            except Exception:
                # six.moves.filter is lazy, so materialize it before testing
                available = list(filter(lambda x: not x.busy, self.queue))
                if available:
                    first_worker = min(available, key=lambda x: x.throttled_time)
                    diff = first_worker.throttled_time - time.time()
                    logger.info("All workers are throttled. Waiting %s seconds" % diff)
                else:
                    diff = 5
                    logger.info("All workers are busy. Waiting %s seconds" % diff)
                time.sleep(diff)
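
TwitterQueue.from_credentials expects one JSON object per line in the credentials
file, with the keys read above (user, token_key, token_secret, consumer_key,
consumer_secret). A sketch of writing such a file, with placeholder values:

import json

creds = [{
    "user": "some_account",                # worker name; placeholder
    "token_key": "TOKEN_KEY",              # placeholders, not real credentials
    "token_secret": "TOKEN_SECRET",
    "consumer_key": "CONSUMER_KEY",
    "consumer_secret": "CONSUMER_SECRET",
}]

with open('credentials.json', 'w') as f:
    for cred in creds:
        f.write(json.dumps(cred) + '\n')   # one JSON object per line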

bitter/models.py (new file)

import time

from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.types import BigInteger, Integer, Text, Boolean
from sqlalchemy import Column, Index

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

Base = declarative_base()


class User(Base):
    __tablename__ = 'users'

    id = Column(BigInteger, primary_key=True, index=True, unique=True)
    contributors_enabled = Column(Boolean)
    created_at_stamp = Column(Text)
    default_profile = Column(Boolean)
    default_profile_image = Column(Boolean)
    description = Column(Text)
    entities = Column(Text)
    favourites_count = Column(Integer)
    followers_count = Column(Integer)
    following = Column(Boolean)
    friends_count = Column(Integer)
    geo_enabled = Column(Boolean)
    has_extended_profile = Column(Boolean)
    id_str = Column(Text)
    is_translation_enabled = Column(Boolean)
    is_translator = Column(Boolean)
    lang = Column(Text)
    listed_count = Column(Integer)
    location = Column(Text)
    name = Column(Text)
    notifications = Column(Boolean)
    profile_background_color = Column(Text)
    profile_background_image_url = Column(Text)
    profile_background_image_url_https = Column(Text)
    profile_background_tile = Column(Boolean)
    profile_banner_url = Column(Text)
    profile_image_url = Column(Text)
    profile_image_url_https = Column(Text)
    profile_link_color = Column(Text)
    profile_sidebar_border_color = Column(Text)
    profile_sidebar_fill_color = Column(Text)
    profile_text_color = Column(Text)
    profile_use_background_image = Column(Boolean)
    protected = Column(Boolean)
    screen_name = Column(Text)
    statuses_count = Column(Integer)
    time_zone = Column(Text)
    url = Column(Text)
    utc_offset = Column(Integer)
    verified = Column(Boolean)


class Following(Base):
    __tablename__ = 'followers'
    # an Index assigned to a plain class attribute is not picked up by
    # declarative; __table_args__ attaches it to the table
    __table_args__ = (Index('follower_index', 'isfollowed', 'follower'),)

    id = Column(Integer, primary_key=True, autoincrement=True)
    isfollowed = Column(Integer)
    follower = Column(Integer)
    created_at_stamp = Column(Text)


class ExtractorEntry(Base):
    __tablename__ = 'extractor-cursor'

    id = Column(Integer, primary_key=True, default=lambda ctx: int(time.time() * 1000))
    user = Column(BigInteger, index=True)
    cursor = Column(BigInteger, default=-1)
    pending = Column(Boolean, default=False)


def make_session(url):
    engine = create_engine(url)  # pass echo=True to log the generated SQL
    Base.metadata.create_all(engine)
    Session = sessionmaker(bind=engine)
    session = Session()
    return session


def test(db='sqlite:///users.db'):
    from sqlalchemy import exists, text
    session = make_session(db)

    our_user = session.query(User).first()

    print(our_user.name)
    print(session.query(User).count())
    fake_user = User(name="Fake user")
    session.add(fake_user)
    session.commit()
    print(session.query(User).count())
    print(session.query(exists().where(User.name == "Fake user")).scalar())
    fake_committed = session.query(User).filter_by(name="Fake user").first()
    print(fake_committed.id)
    print(fake_committed.name)
    session.delete(fake_committed)
    session.commit()
    print(session.query(User).count())
    # bound parameters instead of string interpolation
    print(list(session.execute(text('SELECT 1 FROM users WHERE id = :id'), {'id': 1548})))
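
For reference, a short sketch of how these models fit together, mirroring how
utils.py queries Following (the user id is made up, as in test() above):

from bitter.models import make_session, Following

session = make_session('sqlite:///users.db')  # creates missing tables

uid = 1548  # made-up id
fetched = session.query(Following).filter(Following.isfollowed == uid).count()
print('followers stored for {}: {}'.format(uid, fetched))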

bitter/utils.py (new file)

import logging
import time
import json

import signal
import sys
import sqlalchemy

from itertools import islice
from twitter import TwitterHTTPError

from bitter.models import Following, User, ExtractorEntry, make_session

logger = logging.getLogger(__name__)


def signal_handler(signal, frame):
    logger.info('You pressed Ctrl+C!')
    sys.exit(0)


def get_users(wq, ulist, by_name=False, queue=None, max_users=100):
    t = 'name' if by_name else 'uid'
    logger.debug('Getting users by {}: {}'.format(t, ulist))
    ilist = iter(ulist)
    while True:
        userslice = ",".join(islice(ilist, max_users))
        if not userslice:
            break
        try:
            if by_name:
                resp = wq.users.lookup(screen_name=userslice)
            else:
                resp = wq.users.lookup(user_id=userslice)
        except TwitterHTTPError as ex:
            if ex.e.code in (404,):
                resp = []
            else:
                raise
        if not resp:
            logger.debug('Empty response')
        for user in resp:
            user = trim_user(user)
            if queue:
                queue.put(user)
            else:
                yield user


def trim_user(user):
    if 'status' in user:
        del user['status']
    if 'follow_request_sent' in user:
        del user['follow_request_sent']
    if 'created_at' in user:
        # '%s' (seconds since the epoch) is a platform-specific strftime extension
        ts = time.strftime('%s', time.strptime(user['created_at'], '%a %b %d %H:%M:%S +0000 %Y'))
        user['created_at_stamp'] = ts
        del user['created_at']
    user['entities'] = json.dumps(user['entities'])
    return user


def add_user(session, user, enqueue=False):
    user = trim_user(user)
    # replace any previous record for this id
    session.query(User).filter(User.id == user['id']).delete()
    user = User(**user)
    session.add(user)
    if enqueue:  # was `if extract:`, which tested the function object and was always true
        logger.debug('Adding entry')
        entry = session.query(ExtractorEntry).filter(ExtractorEntry.user == user.id).first()
        if not entry:
            entry = ExtractorEntry(user=user.id)
            session.add(entry)
        logger.debug(entry.pending)
        entry.pending = True
        entry.cursor = -1
    session.commit()


# TODO: adapt to the crawler
def extract(wq, recursive=False, user=None, initfile=None, dburi=None, extractor_name=None):
    signal.signal(signal.SIGINT, signal_handler)

    w = wq.next()
    if not dburi:
        dburi = 'sqlite:///%s.db' % extractor_name

    session = make_session(dburi)

    if initfile or user:  # was `if initfile:`, which silently ignored a lone user argument
        screen_names = []
        user_ids = []
        if not user:
            logger.info("No user. I will open %s" % initfile)
            with open(initfile, 'r') as f:
                for line in f:
                    user = line.strip().split(',')[0]
                    try:
                        int(user)
                        user_ids.append(user)
                    except ValueError:
                        screen_names.append(user.split('@')[-1])
        else:
            try:
                user_ids.append(int(user))
                logger.info("Added id")
            except ValueError as ex:
                logger.info("Exception: {}".format(ex))
                logger.info("Added screen_name")
                screen_names.append(user)
        nusers = list(get_users(wq, screen_names, by_name=True))
        if user_ids:
            nusers += list(get_users(wq, user_ids, by_name=False))

        for i in nusers:
            add_user(session, i, enqueue=True)
    else:
        logger.info('Using pending users from last session')

    total_users = session.query(sqlalchemy.func.count(User.id)).scalar()
    logger.info('Total users: {}'.format(total_users))

    def pending_entries():
        pending = session.query(ExtractorEntry).filter(ExtractorEntry.pending == True).count()
        logger.info('Pending: {}'.format(pending))
        return pending

    while pending_entries() > 0:
        logger.info("Using account: %s" % w.name)
        row = session.query(User, ExtractorEntry).\
            filter(ExtractorEntry.user == User.id).\
            filter(ExtractorEntry.pending == True).\
            order_by(User.followers_count).first()
        if not row:  # unpacking before this check would raise TypeError on None
            break
        candidate, entry = row
        pending = True
        cursor = entry.cursor
        uid = candidate.id
        uobject = session.query(User).filter(User.id == uid).first()
        name = uobject.screen_name if uobject else None

        logger.info("#" * 20)
        logger.info("Getting %s - %s" % (uid, name))
        logger.info("Cursor %s" % cursor)
        logger.info("Pending: %s/%s" % (session.query(ExtractorEntry).filter(ExtractorEntry.pending == True).count(), total_users))
        try:
            resp = wq.followers.ids(user_id=uid, cursor=cursor)
        except TwitterHTTPError as ex:
            if ex.e.code in (401, ):
                logger.info('Not authorized for user: {}'.format(uid))
                resp = {}
            else:
                raise  # anything else used to leave resp undefined
        if 'ids' in resp:
            logger.info("New followers: %s" % len(resp['ids']))
            if recursive:
                newusers = get_users(wq, resp['ids'])  # was get_users(wq, resp)
                for newuser in newusers:
                    add_user(session, newuser, enqueue=True)
            for i in resp['ids']:
                existing_user = session.query(Following).\
                    filter(Following.isfollowed == uid).\
                    filter(Following.follower == i).first()
                now = int(time.time())
                if existing_user:
                    existing_user.created_at_stamp = now
                else:
                    f = Following(isfollowed=uid,
                                  follower=i,
                                  created_at_stamp=now)
                    session.add(f)

            total_followers = candidate.followers_count
            fetched_followers = session.query(Following).filter(Following.isfollowed == uid).count()
            logger.info("Fetched: %s/%s followers" % (fetched_followers,
                                                      total_followers))
            cursor = resp["next_cursor"]
            if cursor > 0:
                pending = True
                logger.info("Getting more followers for %s" % uid)
            else:
                logger.info("Done getting followers for %s" % uid)
                cursor = -1
                pending = False
        else:
            logger.info("Error with id %s %s" % (uid, resp))
            pending = False

        entry.pending = pending
        entry.cursor = cursor
        logger.debug('Entry: {} - {}'.format(entry.user, entry.pending))

        session.add(candidate)
        session.commit()

        sys.stdout.flush()


def get_tweet(c, tid):
    return c.statuses.show(id=tid)


def search_tweet(c, query):
    return c.search.tweets(q=query)


def get_user(c, user):
    try:
        int(user)
        return c.users.lookup(user_id=user)[0]
    except ValueError:
        return c.users.lookup(screen_name=user)[0]
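
extract pages through followers with Twitter's cursoring: each followers/ids
response carries a next_cursor, which stays positive while more pages remain.
A minimal sketch of that loop, assuming a wq built by
TwitterQueue.from_credentials (all_follower_ids is a name invented here):

def all_follower_ids(wq, uid):
    """Yield every follower id for uid, following next_cursor pages."""
    cursor = -1  # -1 asks for the first page
    while True:
        resp = wq.followers.ids(user_id=uid, cursor=cursor)
        for fid in resp.get('ids', []):
            yield fid
        cursor = resp.get('next_cursor', 0)
        if cursor <= 0:  # 0 means there are no more pages
            break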

requirements.txt (new file)

sqlalchemy
six          # used via six.moves in cli.py and crawlers.py
twitter
click
# note: sqlite3 ships with Python and is not installable from PyPI

setup.py (new file)

import pip
from setuptools import setup
from pip.req import parse_requirements

# parse_requirements() returns a generator of pip.req.InstallRequirement objects
# pip 6 introduces the *required* session argument
try:
    install_reqs = parse_requirements("requirements.txt", session=pip.download.PipSession())
    test_reqs = parse_requirements("test-requirements.txt", session=pip.download.PipSession())
except AttributeError:
    install_reqs = parse_requirements("requirements.txt")
    test_reqs = parse_requirements("test-requirements.txt")

# reqs is a list of requirement strings
# e.g. ['django==1.5.1', 'mezzanine==1.4.6']
install_reqs = [str(ir.req) for ir in install_reqs]
test_reqs = [str(ir.req) for ir in test_reqs]


setup(
    name="bitter",
    packages=['bitter'],
    description='''
        Simplifying how researchers access Data.
        It includes a CLI and a library.
    ''',
    author='J. Fernando Sanchez',
    author_email='balkian@gmail.com',
    url="http://balkian.com",
    version="0.2",
    install_requires=install_reqs,
    tests_require=test_reqs,
    include_package_data=True,
    entry_points="""
        [console_scripts]
        bitter=bitter.cli:main
    """
)

test-requirements.txt (new, empty file)

tests/test_crawlers.py (new file)

from unittest import TestCase

from bitter.crawlers import TwitterWorker, TwitterQueue


class TestWorker(TestCase):

    def test_worker(self):
        # TwitterWorker takes a name and a client; a real Twitter client
        # is not needed just to construct one
        w = TwitterWorker('test', None)
        self.assertEqual(w.name, 'test')
        self.assertFalse(w.throttled)

tests/test_models.py (new file)

from unittest import TestCase  # was `unittests`, which does not exist


class TestModels(TestCase):

    def test_worker(self):
        assert True

tests/test_utils.py (new file)

from unittest import TestCase


class TestUtils(TestCase):

    def test_get_user(self):
        assert True