From 030c41b826ae03f3993c558528707eafa14af51a Mon Sep 17 00:00:00 2001
From: J. Fernando Sánchez
Date: Tue, 7 Jan 2020 20:35:29 +0100
Subject: [PATCH] Changes to user and tweet search: cache by default

* Improved printing of credential rate limits
* Tweet and user searches cache results by default. The --write flag has
  been replaced with --dry_run
---
 bitter/VERSION  |  2 +-
 bitter/cli.py   | 98 +++++++++++++++++++++++------------------------
 bitter/utils.py | 27 ++++++++++++++++++-------
 3 files changed, 67 insertions(+), 60 deletions(-)
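
Note on the new `credentials limits` output: instead of dumping one
rate_limit_status() blob per worker, the command now folds every worker's
limits into a single map. Each worker's payload is shaped roughly like this
(abridged; the endpoints and numbers below are only illustrative):

    resp = {
        'resources': {
            'statuses': {
                '/statuses/show/:id': {'limit': 900, 'remaining': 899, 'reset': 1578424800},
            },
            'users': {
                '/users/show/:id': {'limit': 900, 'remaining': 900, 'reset': 1578424800},
            },
        },
    }

For every URL the command stores limits[url][worker.name] = value and keeps a
running 'global' entry that sums 'limit' and 'remaining' across workers. By
default only the aggregated 'global' row of each consumed URL is printed;
--no_aggregate adds the per-worker rows, and --no_diff also lists URLs whose
quota is untouched.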
diff --git a/bitter/VERSION b/bitter/VERSION
index 5712157..5eef0f1 100644
--- a/bitter/VERSION
+++ b/bitter/VERSION
@@ -1 +1 @@
-0.10.1
+0.10.2
diff --git a/bitter/cli.py b/bitter/cli.py
index 3c1094a..1cc9106 100644
--- a/bitter/cli.py
+++ b/bitter/cli.py
@@ -93,6 +93,8 @@ def main(ctx, verbose, logging_level, config, credentials):
 @main.group(invoke_without_command=True)
 @click.pass_context
 def credentials(ctx):
+    if ctx.invoked_subcommand is not None:
+        return
     wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     for worker in wq.queue:
         print('#'*20)
@@ -104,44 +106,49 @@
 
 @credentials.command('limits')
-@click.option('--all', type=bool, default=False, required=False,
-              help=('Print all limits. By default, it only limits that '
+@click.option('--no_aggregate', is_flag=True, default=False,
+              help=('Print limits from all workers. By default, limits are '
+                    'aggregated (summed).'))
+@click.option('--no_diff', is_flag=True, default=False,
+              help=('Print all limits. By default, only limits that '
                     'have been consumed will be shown.'))
 @click.argument('url', required=False)
 @click.pass_context
-def get_limits(ctx, all, url):
+def get_limits(ctx, no_aggregate, no_diff, url):
     wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
-    total = {}
+    limits = {}
+    if url:
+        print('URL is: {}'.format(url))
     for worker in wq.queue:
         resp = worker.client.application.rate_limit_status()
-        print('#'*20)
-        print(worker.name)
-        if url:
-            limit = 'NOT FOUND'
-            print('URL is: {}'.format(url))
-            cat = url.split('/')[1]
-            if cat in resp['resources']:
-                limit = resp['resources'][cat].get(url, None) or resp['resources'][cat]
-            else:
-                print('Cat {} not found'.format(cat))
-                continue
-            for k in limit:
-                total[k] = total.get(k, 0) + limit[k]
-            print('{}: {}'.format(url, limit))
-            continue
-        nres = {}
-        if not all:
-            for res, urls in resp['resources'].items():
-                nurls = {}
-                for u, limits in urls.items():
-                    if limits['limit'] != limits['remaining']:
-                        nurls[u] = limits
-                if nurls:
-                    nres[res] = nurls
-            resp = nres
-        print(json.dumps(resp, indent=2))
-    if url:
-        print('Total for {}: {}'.format(url, total))
+        for urlimits in resp['resources'].values():
+            for u, value in urlimits.items():
+                if url and u != url:
+                    continue
+                if u not in limits:
+                    limits[u] = {}
+                glob = limits[u].get('global', {})
+                limits[u][worker.name] = value
+                for k in ['limit', 'remaining']:
+                    if k not in glob:
+                        glob[k] = 0
+                    glob[k] += value[k]
+                limits[u]['global'] = glob
+    for u, lims in limits.items():
+        worker_list = lims.keys() if no_aggregate else ['global', ]
+
+        url_printed = False
+
+        for worker in worker_list:
+            vals = lims[worker]
+            consumed = vals['limit'] - vals['remaining']
+            if no_diff or consumed:
+                if not url_printed:
+                    print(u)
+                    url_printed = True
+                print('\t', worker, ':')
+                print('\t\t', vals)
+
 
 
 @credentials.command('add')
 @click.option('--consumer_key', default=None)
@@ -169,14 +176,14 @@ def tweet(ctx):
     pass
 
 @tweet.command('get')
-@click.option('-w', '--write', is_flag=True, default=False)
+@click.option('-d', '--dry_run', is_flag=True, default=False)
 @click.option('-f', '--folder', default="tweets")
 @click.option('-u', '--update', help="Update the file even if the tweet exists", is_flag=True, default=False)
 @click.argument('tweetid')
 @serialize
-def get_tweet(tweetid, write, folder, update):
+def get_tweet(tweetid, dry_run, folder, update):
     wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
-    yield from utils.download_tweet(wq, tweetid, write, folder, update)
+    yield from utils.download_tweet(wq, tweetid, not dry_run, folder, update)
 
 @tweet.command('get_all', help='''Download tweets from a list of tweets in a CSV file.
 The result is stored as individual json files in your folder of choice.''')
@@ -245,26 +252,13 @@ def list_users(ctx, db):
 
 @users.command('get')
 @click.argument('user')
-@click.option('-w', '--write', is_flag=True, default=False)
+@click.option('-d', '--dry_run', is_flag=True, default=False)
 @click.option('-f', '--folder', default="users")
 @click.option('-u', '--update', help="Update the file even if the user exists", is_flag=True, default=False)
-def get_user(user, write, folder, update):
+@serialize
+def get_user(user, dry_run, folder, update):
     wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
-    if not write:
-        u = utils.get_user(wq, user)
-        js = json.dumps(u, indent=2)
-        print(js)
-        return
-    if not os.path.exists(folder):
-        os.makedirs(folder)
-    file = os.path.join(folder, '%s.json' % user)
-    if not update and os.path.exists(file) and os.path.isfile(file):
-        print('User exists: %s' % user)
-        return
-    with open(file, 'w') as f:
-        u = utils.get_user(wq, user)
-        js = json.dumps(u, indent=2)
-        print(js, file=f)
+    yield from utils.download_user(wq, user, not dry_run, folder, update)
 
 @users.command('get_all', help='''Download users from a list of user ids/screen names in a CSV file.
 The result is stored as individual json files in your folder of choice.''')
diff --git a/bitter/utils.py b/bitter/utils.py
index 6a98c48..8f39470 100644
--- a/bitter/utils.py
+++ b/bitter/utils.py
@@ -277,8 +277,7 @@ def download_entry(wq, entry_id, dburi=None, recursive=False):
         download_user(wq, session, user, entry, recursive)
     session.close()
 
-
-def download_user(wq, session, user, entry=None, recursive=False, max_followers=50000):
+def crawl_user(wq, session, user, entry=None, recursive=False, max_followers=50000):
 
     total_followers = user.followers_count
 
@@ -478,13 +477,20 @@ def download_tweet(wq, tweetid, write=True, folder="downloaded_tweets", update=F
     tweet = cached_id(tweetid, folder)
     if update or not tweet:
         tweet = get_tweet(wq, tweetid)
-        if write:
-            if tweet:
-                js = json.dumps(tweet)
-                write_json(js, folder)
+        if write and tweet:
+            write_json(tweet, folder)
     yield tweet
 
 
+def download_user(wq, userid, write=True, folder="downloaded_users", update=False):
+    user = cached_id(userid, folder)
+    if update or not user:
+        user = get_user(wq, userid)
+        if write and user:
+            write_json(user, folder, aliases=[user['screen_name'], ])
+    yield user
+
+
 def cached_id(oid, folder):
     tweet = None
     file = os.path.join(folder, '%s.json' % oid)
@@ -497,7 +503,7 @@ def cached_id(oid, folder):
         logger.error('Error getting cached version of {}: {}'.format(oid, ex))
     return tweet
 
-def write_json(js, folder, oid=None):
+def write_json(js, folder, oid=None, aliases=[]):
     if not oid:
         oid = js['id']
     file = id_file(oid, folder)
@@ -506,6 +512,13 @@
     with open(file, 'w') as f:
         json.dump(js, f)
     logger.info('Written {} to file {}'.format(oid, file))
+    for alias in aliases:
+        alias_file = id_file(alias, folder)
+        # Replace a stale symlink, but never clobber a regular file
+        if os.path.islink(alias_file):
+            os.remove(alias_file)
+        if not os.path.exists(alias_file):
+            os.symlink('%s.json' % oid, alias_file)
 
 def id_file(oid, folder):
     return os.path.join(folder, '%s.json' % oid)
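
Usage note: with --dry_run unset, `tweet get` and `users get` now behave as a
read-through cache. A minimal sketch of the intended round trip through the
utils helpers (the imports mirror the ones cli.py uses; 'some_user' is a
placeholder screen name):

    from bitter import config as bconf
    from bitter import crawlers, utils

    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)

    # First call hits the API and writes users/<id>.json, plus a
    # <screen_name>.json symlink so later lookups by name are cache hits.
    user = next(utils.download_user(wq, 'some_user', folder='users'))

    # Second call is answered from disk; no API credits are spent.
    # Pass update=True to force a refresh of the cached copy.
    user = next(utils.download_user(wq, 'some_user', folder='users'))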