1
0
mirror of https://github.com/balkian/bitter.git synced 2024-12-22 00:18:12 +00:00

API command

* Added API command
* Fixed bug in chunk
This commit is contained in:
J. Fernando Sánchez 2017-11-30 16:49:42 +01:00
parent e65f6836b3
commit cf766a6bf3
6 changed files with 282 additions and 121 deletions

View File

@ -22,6 +22,13 @@ wq = easy()
print(wq.users.show(user_name='balkian'))
```
You can also make custom calls to the API through the command line.
e.g. to get the latest 500 tweets by the python software foundation:
```
bitter api statuses/user_timeline --id thepsf --count 500
```
# Credentials format
```

View File

@ -1 +1 @@
0.7.2
0.7.4

View File

@ -240,11 +240,6 @@ def crawl_users(ctx, usersfile, skip, until, threads, db):
logger.info('Done!')
@main.group('api')
def api():
pass
@main.group('extractor')
@click.pass_context
@click.option('--db', required=True, help='Database of users.')
@ -332,7 +327,7 @@ def reset_extractor(ctx):
session = make_session(db)
session.query(ExtractorEntry).filter(ExtractorEntry.pending==True).update({'pending':False})
@api.command('limits')
@main.command('limits')
@click.argument('url', required=False)
@click.pass_context
def get_limits(ctx, url):
@ -353,6 +348,32 @@ def get_limits(ctx, url):
else:
print(json.dumps(resp, indent=2))
@main.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=False))
@click.argument('cmd', nargs=1)
@click.argument('api_args', nargs=-1, type=click.UNPROCESSED)
@click.pass_context
def api(ctx, cmd, api_args):
opts = {}
i = iter(api_args)
for k, v in zip(i, i):
k = k.replace('--', '')
opts[k] = v
wq = crawlers.TwitterQueue.from_credentials(bconf.CREDENTIALS)
resp = utils.consume_feed(wq[cmd], **opts)
# A hack to stream jsons
print('[')
first = True
for i in resp:
if not first:
print(',')
else:
first = False
print(json.dumps(i, indent=2))
print(']')
@main.command('server')
@click.argument('CONSUMER_KEY', required=True)
@click.argument('CONSUMER_SECRET', required=True)

View File

@ -10,6 +10,7 @@ from twitter import *
from collections import OrderedDict
from threading import Lock
from itertools import islice
from functools import partial
try:
import itertools.ifilter as filter
except ImportError:
@ -38,6 +39,9 @@ class AttrToFunc(object):
else:
return extend_call(k)
def __getitem__(self, k):
return partial(self.handler, self.__uriparts+k.split('/'))
def __call__(self, *args, **kwargs):
# for i, a in enumerate(args)e
# kwargs[i] = a
@ -75,6 +79,12 @@ class TwitterWorker(object):
self._client = self.api_class(auth=auth)
return self._client
def __repr__(self):
msg = '<{} for {}>'.format(self.__class__.__name__, self.name)
if self.busy:
msg += ' [busy]'
return msg
class RestWorker(TwitterWorker):
api_class = Twitter

View File

@ -3,6 +3,7 @@ import json
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.types import BigInteger, Integer, Text, Boolean
from sqlalchemy.pool import SingletonThreadPool
from sqlalchemy import Column, Index
from sqlalchemy import create_engine
@ -85,16 +86,20 @@ class ExtractorEntry(Base):
user = Column(BigInteger, index=True)
cursor = Column(BigInteger, default=-1)
pending = Column(Boolean, default=False)
errors = Column(Text, default="")
busy = Column(Boolean, default=False)
def make_session(url):
engine = create_engine(url)#, echo=True)
if not isinstance(url, str):
print(url)
raise Exception("FUCK")
engine = create_engine(url, poolclass=SingletonThreadPool)#, echo=True)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
session = Session()
return session
def test(db='sqlite:///users.db'):
from sqlalchemy import exists

View File

@ -11,16 +11,13 @@ import os
import multiprocessing
from multiprocessing.pool import ThreadPool
from functools import partial
from tqdm import tqdm
from itertools import islice, chain
from contextlib import contextmanager
try:
from itertools import izip_longest
except ImportError:
from itertools import zip_longest
from collections import Counter
from builtins import map, filter
@ -38,16 +35,20 @@ def signal_handler(signal, frame):
logger.info('You pressed Ctrl+C!')
sys.exit(0)
def chunk(iterable, n):
it = iter(iterable)
return iter(lambda: tuple(islice(it, n)), ())
def parallel(func, source, chunksize=1, numcpus=multiprocessing.cpu_count()):
source = chunk(source, chunksize)
p = ThreadPool(numcpus*2)
for i in chain.from_iterable(p.imap_unordered(func, source, int(1000/numcpus))):
results = p.imap_unordered(func, source, chunksize=int(1000/numcpus))
for i in chain.from_iterable(results):
yield i
def get_credentials_path(credfile=None):
if not credfile:
if config.CREDENTIALS:
@ -56,17 +57,20 @@ def get_credentials_path(credfile=None):
raise Exception('No valid credentials file')
return os.path.expanduser(credfile)
@contextmanager
def credentials_file(credfile, *args, **kwargs):
p = get_credentials_path(credfile)
with open(p, *args, **kwargs) as f:
yield f
def iter_credentials(credfile=None):
with credentials_file(credfile) as f:
for l in f:
yield json.loads(l.strip())
def get_credentials(credfile=None, inverse=False, **kwargs):
creds = []
for i in iter_credentials(credfile):
@ -77,11 +81,13 @@ def get_credentials(credfile=None, inverse=False, **kwargs):
creds.append(i)
return creds
def create_credentials(credfile=None):
credfile = get_credentials_path(credfile)
with credentials_file(credfile, 'a'):
pass
def delete_credentials(credfile=None, **creds):
tokeep = get_credentials(credfile, inverse=True, **creds)
with credentials_file(credfile, 'w') as f:
@ -89,6 +95,7 @@ def delete_credentials(credfile=None, **creds):
f.write(json.dumps(i))
f.write('\n')
def add_credentials(credfile=None, **creds):
exist = get_credentials(credfile, **creds)
if not exist:
@ -103,6 +110,7 @@ def get_hashtags(iter_tweets, best=None):
c.update(tag['text'] for tag in tweet.get('entities', {}).get('hashtags', {}))
return c
def read_file(filename, tail=False):
with open(filename) as f:
while True:
@ -144,6 +152,7 @@ def get_users(wq, ulist, by_name=False, queue=None, max_users=100):
else:
yield user
def trim_user(user):
if 'status' in user:
del user['status']
@ -157,14 +166,22 @@ def trim_user(user):
return user
def add_user(session, user, enqueue=False):
def add_user(user, dburi=None, session=None, update=False):
if not session:
session = make_session(dburi)
user = trim_user(user)
olduser = session.query(User).filter(User.id==user['id'])
olduser = session.query(User).filter(User.id == user['id'])
if olduser:
if not update:
return
olduser.delete()
user = User(**user)
session.add(user)
if extract:
nuser = User()
for key, value in user.items():
setattr(nuser, key, value)
user = nuser
if update:
session.add(user)
logger.debug('Adding entry')
entry = session.query(ExtractorEntry).filter(ExtractorEntry.user==user.id).first()
if not entry:
@ -174,125 +191,193 @@ def add_user(session, user, enqueue=False):
entry.pending = True
entry.cursor = -1
session.commit()
session.close()
def download_entry(wq, entry_id, dburi=None, recursive=False):
session = make_session(dburi)
if not session:
raise Exception("Provide dburi or session")
logger.info("Downloading entry: %s (%s)" % (entry_id, type(entry_id)))
entry = session.query(ExtractorEntry).filter(ExtractorEntry.id==entry_id).first()
user = session.query(User).filter(User.id == entry.user).first()
download_user(wq, session, user, entry, recursive)
session.close()
def download_user(wq, session, user, entry=None, recursive=False, max_followers=50000):
total_followers = user.followers_count
if total_followers > max_followers:
entry.pending = False
logger.info("Too many followers for user: %s" % user.screen_name)
session.add(entry)
session.commit()
return
if not entry:
entry = session.query(ExtractorEntry).filter(ExtractorEntry.user==user.id).first() or ExtractorEntry(user=user.id)
session.add(entry)
session.commit()
pending = True
cursor = entry.cursor
uid = user.id
name = user.name
logger.info("#"*20)
logger.info("Getting %s - %s" % (uid, name))
logger.info("Cursor %s" % cursor)
logger.info("Using account: %s" % wq.name)
_fetched_followers = 0
def fetched_followers():
return session.query(Following).filter(Following.isfollowed==uid).count()
attempts = 0
while cursor > 0 or fetched_followers() < total_followers:
try:
resp = wq.followers.ids(user_id=uid, cursor=cursor)
except TwitterHTTPError as ex:
attempts += 1
if ex.e.code in (401, ) or attempts > 3:
logger.info('Not authorized for user: {}'.format(uid))
entry.errors = ex.message
break
if 'ids' not in resp:
logger.info("Error with id %s %s" % (uid, resp))
entry.pending = False
entry.errors = "No ids in response: %s" % resp
break
logger.info("New followers: %s" % len(resp['ids']))
if recursive:
newusers = get_users(wq, resp)
for newuser in newusers:
add_user(session=session, user=newuser)
if 'ids' not in resp or not resp['ids']:
logger.info('NO IDS in response')
break
for i in resp['ids']:
existing_user = session.query(Following).\
filter(Following.isfollowed == uid).\
filter(Following.follower == i).first()
now = int(time.time())
if existing_user:
existing_user.created_at_stamp = now
else:
f = Following(isfollowed=uid,
follower=i,
created_at_stamp=now)
session.add(f)
logger.info("Fetched: %s/%s followers" % (fetched_followers(),
total_followers))
entry.cursor = resp["next_cursor"]
session.add(entry)
session.commit()
logger.info("Done getting followers for %s" % uid)
entry.pending = False
entry.busy = False
session.add(entry)
session.commit()
logger.debug('Entry: {} - {}'.format(entry.user, entry.pending))
sys.stdout.flush()
def classify_user(id_or_name, screen_names, user_ids):
try:
int(id_or_name)
user_ids.append(id_or_name)
logger.debug("Added user id")
except ValueError:
logger.debug("Added screen_name")
screen_names.append(id_or_name.split('@')[-1])
# TODO: adapt to the crawler
def extract(wq, recursive=False, user=None, initfile=None, dburi=None, extractor_name=None):
signal.signal(signal.SIGINT, signal_handler)
w = wq.next()
if not dburi:
dburi = 'sqlite:///%s.db' % extractor_name
session = make_session(dburi)
session.query(ExtractorEntry).update({ExtractorEntry.busy: False})
session.commit()
screen_names = []
user_ids = []
def classify_user(id_or_name):
try:
int(user)
user_ids.append(user)
logger.info("Added user id")
except ValueError:
logger.info("Added screen_name")
screen_names.append(user.split('@')[-1])
if user:
classify_user(user)
elif initfile:
logger.info("No user. I will open %s" % initfile)
with open(initfile, 'r') as f:
for line in f:
user = line.strip().split(',')[0]
classify_user(user)
else:
if not (user or initfile):
logger.info('Using pending users from last session')
else:
screen_names = []
user_ids = []
if user:
classify_user(user, screen_names, user_ids)
elif initfile:
logger.info("No user. I will open %s" % initfile)
with open(initfile, 'r') as f:
for line in f:
user = line.strip().split(',')[0]
classify_user(user, screen_names, user_ids)
def missing_user(ix, column=User.screen_name):
res = session.query(User).filter(column == ix).count() == 0
if res:
logger.info("Missing user %s. Count: %s" % (ix, res))
return res
nusers = list(get_users(wq, screen_names, by_name=True))
if user_ids:
nusers += list(get_users(wq, user_ids, by_name=False))
screen_names = list(filter(missing_user, screen_names))
user_ids = list(filter(partial(missing_user, column=User.id_str), user_ids))
nusers = []
logger.info("Missing user ids: %s" % user_ids)
logger.info("Missing screen names: %s" % screen_names)
if screen_names:
nusers = list(get_users(wq, screen_names, by_name=True))
if user_ids:
nusers += list(get_users(wq, user_ids, by_name=False))
for i in nusers:
add_user(session, i, enqueue=True)
for i in nusers:
add_user(dburi=dburi, user=i)
total_users = session.query(sqlalchemy.func.count(User.id)).scalar()
logger.info('Total users: {}'.format(total_users))
def pending_entries():
pending = session.query(ExtractorEntry).filter(ExtractorEntry.pending == True).count()
logger.info('Pending: {}'.format(pending))
return pending
while pending_entries() > 0:
logger.info("Using account: %s" % w.name)
de = partial(download_entry, wq, dburi=dburi)
pending = pending_entries(dburi)
session.close()
for i in tqdm(parallel(de, pending), desc='Downloading users', total=total_users):
logger.info("Got %s" % i)
def pending_entries(dburi):
session = make_session(dburi)
while True:
candidate, entry = session.query(User, ExtractorEntry).\
filter(ExtractorEntry.user == User.id).\
filter(ExtractorEntry.pending == True).\
order_by(User.followers_count).first()
if not candidate:
break
pending = True
cursor = entry.cursor
uid = candidate.id
uobject = session.query(User).filter(User.id==uid).first()
name = uobject.screen_name if uobject else None
logger.info("#"*20)
logger.info("Getting %s - %s" % (uid, name))
logger.info("Cursor %s" % cursor)
logger.info("Pending: %s/%s" % (session.query(ExtractorEntry).filter(ExtractorEntry.pending==True).count(), total_users))
try:
resp = wq.followers.ids(user_id=uid, cursor=cursor)
except TwitterHTTPError as ex:
if ex.e.code in (401, ):
logger.info('Not authorized for user: {}'.format(uid))
resp = {}
if 'ids' in resp:
logger.info("New followers: %s" % len(resp['ids']))
if recursive:
newusers = get_users(wq, resp)
for user in newusers:
add_user(session, newuser, enqueue=True)
for i in resp['ids']:
existing_user = session.query(Following).\
filter(Following.isfollowed==uid).\
filter(Following.follower==i).first()
now = int(time.time())
if existing_user:
existing_user.created_at_stamp = now
else:
f = Following(isfollowed=uid,
follower=i,
created_at_stamp=now)
session.add(f)
total_followers = candidate.followers_count
fetched_followers = session.query(Following).filter(Following.isfollowed==uid).count()
logger.info("Fetched: %s/%s followers" % (fetched_followers,
total_followers))
cursor = resp["next_cursor"]
if cursor > 0:
pending = True
logger.info("Getting more followers for %s" % uid)
else:
logger.info("Done getting followers for %s" % uid)
cursor = -1
pending = False
else:
logger.info("Error with id %s %s" % (uid, resp))
pending = False
entry.pending = pending
entry.cursor = cursor
logger.debug('Entry: {} - {}'.format(entry.user, entry.pending))
session.add(candidate)
session.commit()
sys.stdout.flush()
filter(ExtractorEntry.user == User.id).\
filter(ExtractorEntry.pending == True).\
filter(ExtractorEntry.busy == False).\
order_by(User.followers_count).first()
if candidate:
entry.busy = True
session.add(entry)
session.commit()
yield int(entry.id)
continue
if session.query(ExtractorEntry).\
filter(ExtractorEntry.busy == True).count() > 0:
time.sleep(1)
continue
logger.info("No more pending entries")
break
session.close()
def get_tweet(c, tid):
return c.statuses.show(id=tid)
@ -394,3 +479,36 @@ def download_tweets(wq, tweetsfile, folder, update=False, retry_failed=False, ig
tweets = parallel(download_batch, lines_to_crawl, 100)
for res in tqdm(parallel(print_result, tweets), desc='Queried'):
pass
def download_timeline(wq, user):
return wq.statuses.user_timeline(id=user)
def consume_feed(func, *args, **kwargs):
'''
Get all the tweets using pagination and a given method.
It can be controlled with the `count` parameter.
If count < 0 => Loop until the whole feed is consumed.
If count == 0 => Only call the API once, with the default values.
If count > 0 => Get count tweets from the feed.
'''
remaining = int(kwargs.pop('count', 0))
consume = remaining < 0
limit = False
# Simulate a do-while by updating the condition at the end
while not limit:
if remaining > 0:
kwargs['count'] = remaining
resp = func(*args, **kwargs)
if not resp:
return
for t in resp:
yield t
if consume:
continue
remaining -= len(resp)
max_id = min(s['id'] for s in func(*args, **kwargs)) - 1
kwargs['max_id'] = max_id
limit = remaining <= 0