From 5199d5b5aac5b3e1647fd469cff05cfb73e6127d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=2E=20Fernando=20S=C3=A1nchez?=
Date: Tue, 20 Mar 2018 13:29:18 +0100
Subject: [PATCH] Improve CLI. Add credentials

---
 README.md           |  82 +++++++++++++++++++++++-
 bitter/VERSION      |   2 +-
 bitter/cli.py       |  98 +++++++++++++++++++++-------
 bitter/crawlers.py  |  10 +--
 bitter/utils.py     | 152 +++++++++++++++++++++++++++++---------------
 tests/test_utils.py |   4 +-
 6 files changed, 266 insertions(+), 82 deletions(-)

diff --git a/README.md b/README.md
index 0a9a2e2..37df3a0 100644
--- a/README.md
+++ b/README.md
@@ -32,9 +32,89 @@ bitter api statuses/user_timeline --id thepsf --count 500
 ```
 
+## Adding credentials
+
+```
+bitter credentials add
+```
+
+You can specify the parameters in the command or let the command line guide you through the process.
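+
+For example, to add a set of credentials non-interactively (all values below are placeholders for your own keys):
+
+```
+bitter credentials add --consumer_key mykey --consumer_secret mysecret --token_key mytoken --token_secret mytokensecret myusername
+```
+
+Any parameter you leave out will be prompted for interactively.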
+
 # Examples
 
-The CLI can query the rest API:
+## Downloading a list of tweets
+
+Bitter can download tweets from a list of tweets in a CSV file.
+The result is stored as individual json files in your folder of choice.
+You can even specify the column number for tweet ids.
+Bitter will not try to download tweets that are already in that folder, unless you pass the `--update` flag.
+
+```
+Usage: bitter tweet get_all [OPTIONS] TWEETSFILE
+
+  Download tweets from a list of tweets in a CSV file. The result is stored
+  as individual json files in your folder of choice.
+
+Options:
+  -f, --folder TEXT
+  -u, --update          Download tweet even if it is already present.
+                        WARNING: it will overwrite existing files!
+  -d, --delimiter TEXT
+  -h, --header          Discard the first line (use it as a header)
+  -q, --quotechar TEXT
+  -c, --column INTEGER
+  --help                Show this message and exit.
+
+```
+
+For instance, this will download the tweets listed in `tweet_ids.csv` into the `tweet_info` folder:
+
+```
+bitter tweet get_all -f tweet_info tweet_ids.csv
+```
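+
+As an illustration, if the ids were in the second column of a CSV file with a header row (made-up contents below), you could select them with the `--header` and `--column` options:
+
+```
+author,tweet_id
+somescreenname,1112223334445556667
+```
+
+```
+bitter tweet get_all -f tweet_info -h -c 1 tweet_ids.csv
+```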
+
+## Downloading a list of users
+
+Bitter downloads users and tweets in a similar way:
+
+```
+Usage: bitter users get_all [OPTIONS] USERSFILE
+
+  Download users from a list of user ids/screen names in a CSV file. The
+  result is stored as individual json files in your folder of choice.
+
+Options:
+  -f, --folder TEXT
+  -u, --update          Download user even if it is already present.
+                        WARNING: it will overwrite existing files!
+  -d, --delimiter TEXT
+  -h, --header          Discard the first line (use it as a header)
+  -q, --quotechar TEXT
+  -c, --column INTEGER
+  --help                Show this message and exit.
+```
+
+The only difference is that users can be downloaded via `screen_name` or `user_id`.
+This method does not try to resolve screen names to user ids, so a user may be downloaded more than once if it appears in both forms.
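+
+For instance, this would download every user listed in a (hypothetical) `user_ids.csv` into the `user_info` folder, skipping any user that is already there:
+
+```
+bitter users get_all -f user_info user_ids.csv
+```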
+
+## Downloading a stream
+
+```
+Usage: bitter stream get [OPTIONS]
+
+Options:
+  -l, --locations TEXT
+  -t, --track TEXT
+  -f, --file TEXT       File to store the stream of tweets. Default: standard output
+  -p, --politelyretry   Politely retry after a hangup/connection error
+  --help                Show this message and exit.
+```
+
+```
+bitter --config .bitter.yaml stream get
+```
+
+## REST queries
+
+In newer versions of bitter, the individual methods to download tweets/users through the REST API are being replaced with a generic method to call the API:
+
+```
+bitter api CMD --parameter VALUE ...
+           [--tweets | --users] [--max_count MAX_COUNT] [--count COUNT_PER_CALL]
+```
+
+For instance, this searches for recent tweets that mention bitter and appends them to `mytweets.jsonlines`:
+
+```
+python -m bitter.cli --config .bitter.yaml api '/search/tweets' --result_type recent --q 'bitter OR #bitter OR @bitter' --tweet_mode extended --tweets --max_count 5000 >> mytweets.jsonlines
+```
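+
+The `--tweets` and `--users` flags enable smart pagination. As an illustrative call (reusing the endpoint from the first example above), this would request pages of 200 tweets until 1000 tweets of the timeline have been fetched:
+
+```
+bitter api statuses/user_timeline --id thepsf --tweets --count 200 --max_count 1000
+```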
diff --git a/bitter/VERSION b/bitter/VERSION
index a3df0a6..ac39a10 100644
--- a/bitter/VERSION
+++ b/bitter/VERSION
@@ -1 +1 @@
-0.8.0
+0.9.0
diff --git a/bitter/cli.py b/bitter/cli.py
index 92d395a..493bcb9 100644
--- a/bitter/cli.py
+++ b/bitter/cli.py
@@ -41,6 +41,32 @@ def main(ctx, verbose, logging_level, config, credentials):
     if os.path.exists(utils.get_config_path(credentials)):
         utils.copy_credentials_to_config(credentials, config)
 
+
+@main.group()
+@click.pass_context
+def credentials(ctx):
+    pass
+
+@credentials.command('add')
+@click.option('--consumer_key', default=None)
+@click.option('--consumer_secret', default=None)
+@click.option('--token_key', default=None)
+@click.option('--token_secret', default=None)
+@click.argument('user_name')
+def add(user_name, consumer_key, consumer_secret, token_key, token_secret):
+    if not consumer_key:
+        consumer_key = click.prompt('Please enter your CONSUMER KEY')
+    if not consumer_secret:
+        consumer_secret = click.prompt('Please enter your CONSUMER SECRET')
+    if not token_key:
+        token_key = click.prompt('Please enter your ACCESS TOKEN')
+    if not token_secret:
+        token_secret = click.prompt('Please enter your ACCESS TOKEN SECRET')
+    utils.add_credentials(conffile=bconf.CONFIG_FILE, user=user_name,
+                          consumer_key=consumer_key, consumer_secret=consumer_secret,
+                          token_key=token_key, token_secret=token_secret)
+    click.echo('Credentials added for {}'.format(user_name))
+
+
 @main.group()
 @click.pass_context
 def tweet(ctx):
@@ -52,22 +78,36 @@ def tweet(ctx):
 @click.option('-u', '--update', help="Update the file even if the tweet exists", is_flag=True, default=False)
 @click.argument('tweetid')
 def get_tweet(tweetid, write, folder, update):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     utils.download_tweet(wq, tweetid, write, folder, update)
 
-
-@tweet.command('get_all')
+
+@tweet.command('get_all', help='''Download tweets from a list of tweets in a CSV file.
+The result is stored as individual json files in your folder of choice.''')
 @click.argument('tweetsfile', 'File with a list of tweets to look up')
 @click.option('-f', '--folder', default="tweets")
+@click.option('-u', '--update', is_flag=True, default=False, help='Download tweet even if it is already present. WARNING: it will overwrite existing files!')
+@click.option('-d', '--delimiter', default=",")
+@click.option('-h', '--header', help='Discard the first line (use it as a header)',
+              is_flag=True, default=False)
+@click.option('-q', '--quotechar', default='"')
+@click.option('-c', '--column', type=int, default=0)
 @click.pass_context
-def get_tweets(ctx, tweetsfile, folder):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
-    utils.download_tweets(wq, tweetsfile, folder)
+def get_tweets(ctx, tweetsfile, folder, update, delimiter, header, quotechar, column):
+    if update and not click.confirm('This may overwrite existing tweets. Continue?'):
+        click.echo('Cancelling')
+        return
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
+    for i in utils.download_file(wq, tweetsfile, folder, delimiter=delimiter,
+                                 batch_method=utils.tweet_download_batch,
+                                 header=header, quotechar=quotechar,
+                                 column=column, update=update):
+        pass
 
 @tweet.command('search')
 @click.argument('query')
 @click.pass_context
 def search(ctx, query):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     t = utils.search_tweet(wq, query)
     print(json.dumps(t, indent=2))
 
@@ -75,7 +115,7 @@ def search(ctx, query):
 @click.argument('user')
 @click.pass_context
 def timeline(ctx, user):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     t = utils.user_timeline(wq, user)
     print(json.dumps(t, indent=2))
 
@@ -101,7 +141,7 @@ def list_users(ctx, db):
 @click.option('-f', '--folder', default="users")
 @click.option('-u', '--update', help="Update the file even if the user exists", is_flag=True, default=False)
 def get_user(user, write, folder, update):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     if not write:
         u = utils.get_user(wq, user)
         js = json.dumps(u, indent=2)
@@ -118,15 +158,28 @@ def get_user(user, write, folder, update):
         js = json.dumps(u, indent=2)
         print(js, file=f)
 
-@users.command('get_all')
+@users.command('get_all', help='''Download users from a list of user ids/screen names in a CSV file.
+    The result is stored as individual json files in your folder of choice.''')
 @click.argument('usersfile', 'File with a list of users to look up')
 @click.option('-f', '--folder', default="users")
+@click.option('-u', '--update', is_flag=True, default=False, help='Download user even if it is already present. WARNING: it will overwrite existing files!')
+@click.option('-d', '--delimiter', default=",")
+@click.option('-h', '--header', help='Discard the first line (use it as a header)',
+              is_flag=True, default=False)
+@click.option('-q', '--quotechar', default='"')
+@click.option('-c', '--column', type=int, default=0)
 @click.pass_context
-def get_users(ctx, usersfile, folder):
-    with open(usersfile) as f:
-        for line in f:
-            uid = line.strip()
-            ctx.invoke(get_user, folder=folder, user=uid, write=True)
+def get_users(ctx, usersfile, folder, update, delimiter, header, quotechar, column):
+    if update and not click.confirm('This may overwrite existing users. Continue?'):
+        click.echo('Cancelling')
+        return
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
+    for i in utils.download_file(wq, usersfile, folder, delimiter=delimiter,
+                                 batch_method=utils.user_download_batch,
+                                 update=update,
+                                 header=header, quotechar=quotechar,
+                                 column=column):
+        pass
 
 @users.command('crawl')
 @click.option('--db', required=True, help='Database to save all users.')
@@ -147,7 +200,7 @@ def crawl_users(ctx, usersfile, skip, until, threads, db):
             return ExitStack()
 
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
 
     logger.info('Starting Network crawler with {} threads and {} credentials.'.format(threads, len(wq.queue)))
 
@@ -311,7 +364,7 @@ def users_extractor(ctx):
 @click.pass_context
 def extract(ctx, recursive, user, name, initfile):
     print(locals())
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     dburi = ctx.obj['DBURI']
     utils.extract(wq,
                   recursive=recursive,
@@ -323,7 +376,7 @@ def extract(ctx, recursive, user, name, initfile):
 @extractor.command('reset')
 @click.pass_context
 def reset_extractor(ctx):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     db = ctx.obj['DBURI']
     session = make_session(db)
     session.query(ExtractorEntry).filter(ExtractorEntry.pending==True).update({'pending':False})
@@ -332,7 +385,7 @@ def reset_extractor(ctx):
 @click.argument('url', required=False)
 @click.pass_context
 def get_limits(ctx, url):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     total = {}
     for worker in wq.queue:
         resp = worker.client.application.rate_limit_status()
@@ -357,7 +410,8 @@ def get_limits(ctx, url):
 
 
-@main.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=False))
+@main.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=False),
+              help='''Issue a call to an endpoint of the Twitter API.''')
 @click.argument('cmd', nargs=1)
 @click.option('--tweets', is_flag=True, help='Fetch more tweets using smart pagination. Use --count to control how many tweets to fetch per call, and --max_count to set the number of desired tweets (or -1 to get as many as possible).', type=bool, default=False)
 @click.option('--users', is_flag=True, help='Fetch more users using smart pagination. Use --count to control how many users to fetch per call, and --max_count to set the number of desired users (or -1 to get as many as possible).', type=bool, default=False)
@@ -374,7 +428,7 @@ def api(ctx, cmd, tweets, users, api_args):
         if k in mappings:
             k = mappings[k]
         opts[k] = v
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     if tweets:
         resp = utils.consume_tweets(wq[cmd], **opts)
     elif users:
@@ -409,7 +463,7 @@ def stream(ctx):
 @click.option('-p', '--politelyretry', help='Politely retry after a hangup/connection error', is_flag=True, default=True)
 @click.pass_context
 def get_stream(ctx, locations, track, file, politelyretry):
-    wq = crawlers.StreamQueue.from_config(bconf.CONFIG_FILE, 1)
+    wq = crawlers.StreamQueue.from_config(conffile=bconf.CONFIG_FILE, max_workers=1)
 
     query_args = {}
     if locations:
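The next hunk, in `bitter/crawlers.py`, lets `from_config` accept an already-parsed configuration dict as an alternative to reading a config file. A minimal sketch of the new calling style (the credentials shown are made-up placeholders):

```python
from bitter import crawlers

# Hypothetical, hard-coded configuration; normally this is read from the config file
config = {'credentials': [
    {'user': 'myusername',
     'consumer_key': 'mykey', 'consumer_secret': 'mysecret',
     'token_key': 'mytoken', 'token_secret': 'mytokensecret'},
]}
wq = crawlers.TwitterQueue.from_config(config=config)
```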
diff --git a/bitter/crawlers.py b/bitter/crawlers.py
index 8ebd0c8..399394b 100644
--- a/bitter/crawlers.py
+++ b/bitter/crawlers.py
@@ -61,12 +61,14 @@ class FromCredentialsMixin(object):
 
 class FromConfigMixin(object):
     @classmethod
-    def from_config(cls, conffile=None, max_workers=None):
+    def from_config(cls, config=None, conffile=None, max_workers=None):
         wq = cls()
 
-        with utils.config(conffile) as c:
-            for cred in islice(c['credentials'], max_workers):
-                wq.ready(cls.worker_class(cred["user"], cred))
+        if not config:
+            with utils.config(conffile) as c:
+                config = c
+        for cred in islice(config['credentials'], max_workers):
+            wq.ready(cls.worker_class(cred["user"], cred))
         return wq
 
 class TwitterWorker(object):
diff --git a/bitter/utils.py b/bitter/utils.py
index d520e39..02f9368 100644
--- a/bitter/utils.py
+++ b/bitter/utils.py
@@ -4,6 +4,7 @@ import logging
 import time
 import json
 import yaml
+import csv
 
 import io
 import signal
@@ -93,7 +94,7 @@ def read_config(conffile):
     p = conffile and get_config_path(conffile)
     if p:
         if not os.path.exists(p):
-            raise Exception('{} file does not exist.'.format(p))
+            raise IOError('{} file does not exist.'.format(p))
         f = open(p, 'r')
     elif 'BITTER_CONFIG' not in os.environ:
         raise Exception('No config file or BITTER_CONFIG env variable.')
@@ -103,6 +104,8 @@ def read_config(conffile):
 
 def write_config(conf, conffile=None):
+    if not conf:
+        conf = {'credentials': []}
     if conffile:
         p = get_config_path(conffile)
         with open(p, 'w') as f:
@@ -122,6 +125,7 @@ def create_config_file(conffile=None):
     conffile = get_config_path(conffile)
     with open(conffile, 'a'):
         pass
+    write_config(None, conffile)
 
 
 def get_credentials(conffile=None, inverse=False, **kwargs):
@@ -142,7 +146,11 @@ def delete_credentials(conffile=None, **creds):
 
 
 def add_credentials(conffile=None, **creds):
-    exist = get_credentials(conffile, **creds)
+    try:
+        exist = get_credentials(conffile, **creds)
+    except IOError:
+        exist = False
+        create_config_file(conffile)
     if exist:
         return
     with config(conffile) as c:
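With these changes, `add_credentials` becomes self-healing: if the config file does not exist yet, it is created with an empty credentials list before the new entry is appended. A rough sketch of the resulting flow from Python (the file name and values are illustrative):

```python
from bitter import utils

# Creates the config file with an empty credential list if it is missing
utils.add_credentials(conffile='.bitter.yaml', user='myusername',
                      consumer_key='mykey', consumer_secret='mysecret',
                      token_key='mytoken', token_secret='mytokensecret')

# Stored entries can be looked up by their fields, e.g. the user name
print(utils.get_credentials(conffile='.bitter.yaml', user='myusername'))
```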
@@ -451,86 +459,128 @@ def get_user(c, user):
         return c.users.lookup(screen_name=user)[0]
 
 def download_tweet(wq, tweetid, write=True, folder="downloaded_tweets", update=False):
-    cached = cached_tweet(tweetid, folder)
+    cached = cached_id(tweetid, folder)
     tweet = None
     if update or not cached:
         tweet = get_tweet(wq, tweetid)
-    js = json.dumps(tweet, indent=2)
+    js = json.dumps(tweet)
     if write:
         if tweet:
-            write_tweet_json(js, folder)
+            write_json(tweet, folder)
     else:
         print(js)
 
-def cached_tweet(tweetid, folder):
+def cached_id(oid, folder):
     tweet = None
-    file = os.path.join(folder, '%s.json' % tweetid)
+    file = os.path.join(folder, '%s.json' % oid)
     if os.path.exists(file) and os.path.isfile(file):
         try:
-            # print('%s: Tweet exists' % tweetid)
+            # print('%s: Object exists' % oid)
             with open(file) as f:
                 tweet = json.load(f)
         except Exception as ex:
-            logger.error('Error getting cached version of {}: {}'.format(tweetid, ex))
+            logger.error('Error getting cached version of {}: {}'.format(oid, ex))
     return tweet
 
-def write_tweet_json(js, folder):
-    tweetid = js['id']
-    file = tweet_file(tweetid, folder)
+def write_json(js, folder, oid=None):
+    if not oid:
+        oid = js['id']
+    file = id_file(oid, folder)
     if not os.path.exists(folder):
         os.makedirs(folder)
     with open(file, 'w') as f:
-        json.dump(js, f, indent=2)
-        logger.info('Written {} to file {}'.format(tweetid, file))
+        json.dump(js, f)
+        logger.info('Written {} to file {}'.format(oid, file))
 
-def tweet_file(tweetid, folder):
-    return os.path.join(folder, '%s.json' % tweetid)
+def id_file(oid, folder):
+    return os.path.join(folder, '%s.json' % oid)
 
-def tweet_fail_file(tweetid, folder):
+def fail_file(oid, folder):
     failsfolder = os.path.join(folder, 'failed')
     if not os.path.exists(failsfolder):
         os.makedirs(failsfolder)
-    return os.path.join(failsfolder, '%s.failed' % tweetid)
+    return os.path.join(failsfolder, '%s.failed' % oid)
 
-def tweet_failed(tweetid, folder):
-    return os.path.isfile(tweet_fail_file(tweetid, folder))
+def id_failed(oid, folder):
+    return os.path.isfile(fail_file(oid, folder))
 
-def download_tweets(wq, tweetsfile, folder, update=False, retry_failed=False, ignore_fails=True):
-    def filter_line(line):
-        tweetid = int(line)
-        # print('Checking {}'.format(tweetid))
-        if (cached_tweet(tweetid, folder) and not update) or (tweet_failed(tweetid, folder) and not retry_failed):
+def tweet_download_batch(wq, batch):
+    tweets = wq.statuses.lookup(_id=",".join(batch), map=True)['id']
+    return tweets.items()
+
+def user_download_batch(wq, batch):
+    screen_names = []
+    user_ids = []
+    for elem in batch:
+        try:
+            user_ids.append(int(elem))
+        except ValueError:
+            screen_names.append(elem)
+    print('Downloading: {} - {}'.format(user_ids, screen_names))
+    users = wq.users.lookup(user_id=",".join(str(uid) for uid in user_ids),
+                            screen_name=",".join(screen_names))
+    found_ids = []
+    found_names = []
+    for user in users:
+        uid = user['id']
+        if uid in user_ids:
+            found_ids.append(uid)
+            yield (uid, user)
+        uname = user['screen_name']
+        if uname in screen_names:
+            found_names.append(uname)
+            yield (uname, user)
+    for uid in set(user_ids) - set(found_ids):
+        yield (uid, None)
+    for name in set(screen_names) - set(found_names):
+        yield (name, None)
+
+
+def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fails=True,
+                  batch_method=tweet_download_batch):
+    def filter_lines(line):
+        # print('Checking {}'.format(line))
+        oid = line
+        if (cached_id(oid, folder) and not update) or (id_failed(oid, folder) and not retry_failed):
             yield None
         else:
-            yield line
+            yield str(oid)
 
     def print_result(res):
-        tid, tweet = res
-        if tweet:
-            try:
-                write_tweet_json(tweet, folder=folder)
-                yield 1
-            except Exception as ex:
-                logger.error('%s: %s' % (tid, ex))
-                if not ignore_fails:
-                    raise
-        else:
-            logger.info('Tweet not recovered: {}'.format(tid))
-            with open(tweet_fail_file(tid, folder), 'w') as f:
-                print('Tweet not found', file=f)
-            yield -1
-
-    def download_batch(batch):
-        tweets = wq.statuses.lookup(_id=",".join(batch), map=True)['id']
-        return tweets.items()
-
-    with open(tweetsfile) as f:
-        lines = map(lambda x: x.strip(), f)
-        lines_to_crawl = filter(lambda x: x is not None, tqdm(parallel(filter_line, lines), desc='Total lines'))
-        tweets = parallel(download_batch, lines_to_crawl, 100)
-        for res in tqdm(parallel(print_result, tweets), desc='Queried'):
-            pass
+        for oid, obj in res:
+            if obj:
+                try:
+                    write_json(obj, folder=folder, oid=oid)
+                    yield 1
+                except Exception as ex:
+                    logger.error('%s: %s' % (oid, ex))
+                    if not ignore_fails:
+                        raise
+            else:
+                logger.info('Object not recovered: {}'.format(oid))
+                with open(fail_file(oid, folder), 'w') as f:
+                    print('Object not found', file=f)
+                yield -1
+
+    objects_to_crawl = filter(lambda x: x is not None, tqdm(parallel(filter_lines, lst), desc='Total objects'))
+    batch_method = partial(batch_method, wq)
+    tweets = parallel(batch_method, objects_to_crawl, 100)
+    for res in tqdm(parallel(print_result, tweets), desc='Queried'):
+        yield res
+
+
+def download_file(wq, csvfile, folder, column=0, delimiter=',',
+                  header=False, quotechar='"', batch_method=tweet_download_batch,
+                  **kwargs):
+    with open(csvfile) as f:
+        csvreader = csv.reader(f, delimiter=str(delimiter), quotechar=str(quotechar))
+        if header:
+            next(csvreader)
+        tweets = map(lambda row: row[column].strip(), csvreader)
+        for res in download_list(wq, tweets, folder, batch_method=batch_method,
+                                 **kwargs):
+            yield res
+
 
 def download_timeline(wq, user):
     return wq.statuses.user_timeline(id=user)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index e052ba0..417f1d4 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -54,7 +54,7 @@ class TestUtils(TestCase):
         toc = time.time()
         assert (tic-toc) < 600
         resp2 = utils.parallel(echo, [1,2,3,4], chunksize=2)
-        assert list(resp2) == [1,2,3,4]
+        assert list(resp2) == [1,2, 3,4]
 
 
 class TestUtilsEnv(TestUtils):
@@ -68,5 +68,3 @@ class TestUtilsEnv(TestUtils):
     def tearDown(self):
         if hasattr(self, 'oldenv'):
             os.environ['BITTER_CONFIG'] = self.oldenv
-
-