mirror of https://github.com/balkian/bitter.git synced 2024-12-22 08:28:12 +00:00

Changes to user and tweet search: Cache by default

* Improved printing of credential limits
* Tweet and user fetching commands cache their results by default; the -w/--write flag has been replaced by -d/--dry_run (see the usage sketch below)
J. Fernando Sánchez 2020-01-07 20:35:29 +01:00
parent bba73091e4
commit 030c41b826
3 changed files with 60 additions and 57 deletions
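
For orientation, the renamed flags could be exercised with Click's test runner roughly as below. This is a sketch: the import path bitter.cli.main, the 'users' group name and the sample ids are assumptions, not shown on this page.

# Usage sketch for the flags introduced by this commit (hypothetical import path).
from click.testing import CliRunner
from bitter.cli import main   # assumption: the CLI group shown in this diff

runner = CliRunner()

# Credential limits are aggregated across workers by default;
# --no_aggregate prints one block per worker, --no_diff also shows
# limits that have not been consumed yet.
print(runner.invoke(main, ['credentials', 'limits', '--no_aggregate']).output)

# Tweets and users are cached to disk by default; -d/--dry_run only prints.
print(runner.invoke(main, ['tweet', 'get', '--dry_run', '1234567890']).output)
print(runner.invoke(main, ['users', 'get', 'some_user']).output)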

View File

@@ -1 +1 @@
-0.10.1
+0.10.2

View File

@@ -93,6 +93,8 @@ def main(ctx, verbose, logging_level, config, credentials):
 @main.group(invoke_without_command=True)
 @click.pass_context
 def credentials(ctx):
+    if ctx.invoked_subcommand is not None:
+        return
     wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     for worker in wq.queue:
         print('#'*20)
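
The two added lines rely on standard Click behaviour: with invoke_without_command=True the group callback runs even when a subcommand is given, so without the early return the full credentials listing would execute before every subcommand such as `credentials limits`. A standalone illustration of the pattern (not project code):

import click

@click.group(invoke_without_command=True)
@click.pass_context
def credentials(ctx):
    if ctx.invoked_subcommand is not None:
        return   # a subcommand like `limits` will handle the request itself
    click.echo('listing all credentials (default action)')

@credentials.command('limits')
def limits():
    click.echo('printing limits only')

if __name__ == '__main__':
    credentials()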
@@ -104,44 +106,47 @@ def credentials(ctx):

 @credentials.command('limits')
-@click.option('--all', type=bool, default=False, required=False,
-              help=('Print all limits. By default, it only limits that '
+@click.option('--no_aggregate', is_flag=True, default=False,
+              help=('Print limits from all workers. By default, limits are '
+                    'aggregated (summed).'))
+@click.option('--no_diff', is_flag=True, default=False,
+              help=('Print all limits. By default, only limits that '
                     'have been consumed will be shown.'))
 @click.argument('url', required=False)
 @click.pass_context
-def get_limits(ctx, all, url):
+def get_limits(ctx, no_aggregate, no_diff, url):
     wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
-    total = {}
+    limits = {}
+    if url:
+        print('URL is: {}'.format(url))
     for worker in wq.queue:
         resp = worker.client.application.rate_limit_status()
-        print('#'*20)
-        print(worker.name)
-        if url:
-            limit = 'NOT FOUND'
-            print('URL is: {}'.format(url))
-            cat = url.split('/')[1]
-            if cat in resp['resources']:
-                limit = resp['resources'][cat].get(url, None) or resp['resources'][cat]
-            else:
-                print('Cat {} not found'.format(cat))
-                continue
-            for k in limit:
-                total[k] = total.get(k, 0) + limit[k]
-            print('{}: {}'.format(url, limit))
-            continue
-        nres = {}
-        if not all:
-            for res, urls in resp['resources'].items():
-                nurls = {}
-                for u, limits in urls.items():
-                    if limits['limit'] != limits['remaining']:
-                        nurls[u] = limits
-                if nurls:
-                    nres[res] = nurls
-            resp = nres
-        print(json.dumps(resp, indent=2))
-    if url:
-        print('Total for {}: {}'.format(url, total))
+        for urlimits in resp['resources'].values():
+            for url, value in urlimits.items():
+                if url not in limits:
+                    limits[url] = {}
+                glob = limits[url].get('global', {})
+                limits[url][worker.name] = value
+                for k in ['limit', 'remaining']:
+                    if k not in glob:
+                        glob[k] = 0
+                    glob[k] += value[k]
+                limits[url]['global'] = glob
+
+    for url, lims in limits.items():
+        worker_list = lims.keys() if no_aggregate else ['global', ]
+
+        url_printed = False
+
+        for worker in worker_list:
+            vals = lims[worker]
+            consumed = vals['limit'] - vals['remaining']
+            if no_diff or consumed:
+                if not url_printed:
+                    print(url)
+                    url_printed = True
+                print('\t', worker, ':')
+                print('\t\t', vals)


 @credentials.command('add')
 @click.option('--consumer_key', default=None)
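
The rewritten get_limits collects every worker's rate limits into one dictionary keyed by URL and sums 'limit' and 'remaining' into a synthetic 'global' entry, which is what gets printed unless --no_aggregate is passed. A self-contained sketch of that aggregation, with made-up numbers:

# Each worker reports {'limit': N, 'remaining': M} per URL; the 'global'
# entry accumulates the counters across workers. Sample data only.
workers = {
    'worker1': {'/statuses/lookup': {'limit': 900, 'remaining': 850}},
    'worker2': {'/statuses/lookup': {'limit': 900, 'remaining': 900}},
}

limits = {}
for name, urls in workers.items():
    for url, value in urls.items():
        entry = limits.setdefault(url, {})
        glob = entry.setdefault('global', {'limit': 0, 'remaining': 0})
        entry[name] = value
        for k in ('limit', 'remaining'):
            glob[k] += value[k]

print(limits['/statuses/lookup']['global'])   # {'limit': 1800, 'remaining': 1750}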
@@ -169,14 +174,14 @@ def tweet(ctx):
     pass

 @tweet.command('get')
-@click.option('-w', '--write', is_flag=True, default=False)
+@click.option('-d', '--dry_run', is_flag=True, default=False)
 @click.option('-f', '--folder', default="tweets")
 @click.option('-u', '--update', help="Update the file even if the tweet exists", is_flag=True, default=False)
 @click.argument('tweetid')
 @serialize
-def get_tweet(tweetid, write, folder, update):
+def get_tweet(tweetid, dry_run, folder, update):
     wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
-    yield from utils.download_tweet(wq, tweetid, write, folder, update)
+    yield from utils.download_tweet(wq, tweetid, not dry_run, folder, update)

 @tweet.command('get_all', help='''Download tweets from a list of tweets in a CSV file.
 The result is stored as individual json files in your folder of choice.''')
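
The downloader keeps a boolean write argument, so the CLI only inverts the new flag: per the commit message, caching is now the default and -d/--dry_run turns it off. A minimal sketch of that mapping (not project code):

def downloader_args(dry_run=False):
    # dry_run=False (default) -> write=True: fetch and cache to disk
    # dry_run=True            -> write=False: fetch and print only
    return {'write': not dry_run}

assert downloader_args() == {'write': True}
assert downloader_args(dry_run=True) == {'write': False}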
@@ -245,26 +250,13 @@ def list_users(ctx, db):

 @users.command('get')
 @click.argument('user')
-@click.option('-w', '--write', is_flag=True, default=False)
+@click.option('-d', '--dry_run', is_flag=True, default=False)
 @click.option('-f', '--folder', default="users")
 @click.option('-u', '--update', help="Update the file even if the user exists", is_flag=True, default=False)
-def get_user(user, write, folder, update):
+@serialize
+def get_user(user, dry_run, folder, update):
     wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
-    if not write:
-        u = utils.get_user(wq, user)
-        js = json.dumps(u, indent=2)
-        print(js)
-        return
-    if not os.path.exists(folder):
-        os.makedirs(folder)
-    file = os.path.join(folder, '%s.json' % user)
-    if not update and os.path.exists(file) and os.path.isfile(file):
-        print('User exists: %s' % user)
-        return
-    with open(file, 'w') as f:
-        u = utils.get_user(wq, user)
-        js = json.dumps(u, indent=2)
-        print(js, file=f)
+    yield from utils.download_user(wq, user, not dry_run, folder, update)

 @users.command('get_all', help='''Download users from a list of user ids/screen names in a CSV file.
 The result is stored as individual json files in your folder of choice.''')

View File

@@ -277,8 +277,7 @@ def download_entry(wq, entry_id, dburi=None, recursive=False):
         download_user(wq, session, user, entry, recursive)
     session.close()

-def download_user(wq, session, user, entry=None, recursive=False, max_followers=50000):
+def crawl_user(wq, session, user, entry=None, recursive=False, max_followers=50000):
     total_followers = user.followers_count
@@ -478,13 +477,23 @@ def download_tweet(wq, tweetid, write=True, folder="downloaded_tweets", update=False):
     tweet = cached_id(tweetid, folder)
     if update or not tweet:
         tweet = get_tweet(wq, tweetid)
-    if write:
+    if write and update:
         if tweet:
             js = json.dumps(tweet)
             write_json(js, folder)
     yield tweet


+def download_user(wq, userid, write=True, folder="downloaded_users", update=False):
+    user = cached_id(userid, folder)
+    if update or not user:
+        user = get_user(wq, userid)
+    if write and update:
+        if user:
+            write_json(user, folder, aliases=[user['screen_name'], ])
+    yield user
+
+
 def cached_id(oid, folder):
     tweet = None
     file = os.path.join(folder, '%s.json' % oid)
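
Both download_tweet and the new download_user follow the same read-through pattern: serve the cached JSON when it exists, call the API only when there is no cache or --update was requested, and rewrite the file only when both write and update are set. A standalone sketch of the flow, with a stub fetcher standing in for the Twitter API:

import json
import os

def cached_json(oid, folder):
    # Return the locally cached object, or None when it has not been saved yet.
    path = os.path.join(folder, '%s.json' % oid)
    if os.path.isfile(path):
        with open(path) as f:
            return json.load(f)
    return None

def download(oid, fetch, folder='cache', write=True, update=False):
    # Read-through cache: only call `fetch` on a cache miss or when an update
    # was requested; only (re)write the file when write and update are set.
    obj = cached_json(oid, folder)
    if update or not obj:
        obj = fetch(oid)
    if write and update and obj:
        os.makedirs(folder, exist_ok=True)
        with open(os.path.join(folder, '%s.json' % oid), 'w') as f:
            json.dump(obj, f)
    return obj

def never_called(oid):
    raise RuntimeError('should have been served from the cache')

print(download('1234', fetch=lambda oid: {'id': oid}, update=True))
print(download('1234', fetch=never_called))   # cache hit, fetch is not called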
@@ -497,7 +506,7 @@ def cached_id(oid, folder):
         logger.error('Error getting cached version of {}: {}'.format(oid, ex))
     return tweet


-def write_json(js, folder, oid=None):
+def write_json(js, folder, oid=None, aliases=[]):
     if not oid:
         oid = js['id']
     file = id_file(oid, folder)
@@ -506,6 +515,8 @@ def write_json(js, folder, oid=None):
     with open(file, 'w') as f:
         json.dump(js, f)
     logger.info('Written {} to file {}'.format(oid, file))
+    for alias in aliases:
+        os.symlink('%s.json' % oid, id_file(alias, folder))


 def id_file(oid, folder):
     return os.path.join(folder, '%s.json' % oid)
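
The aliases parameter is used by download_user so the cached file can be found by screen_name as well as by numeric id: write_json stores id.json and then creates a relative symlink named after each alias. A standalone sketch of the same mechanism; note that os.symlink raises FileExistsError if the link already exists, which the hunk above does not guard against, so the sketch checks first:

import json
import os

folder = 'downloaded_users'
os.makedirs(folder, exist_ok=True)

# The canonical file is named after the numeric id...
user = {'id': 42, 'screen_name': 'example_user'}
target = '%s.json' % user['id']
with open(os.path.join(folder, target), 'w') as f:
    json.dump(user, f)

# ...and a relative symlink named after the screen_name points at it.
alias = os.path.join(folder, '%s.json' % user['screen_name'])
if not os.path.lexists(alias):        # avoid FileExistsError on repeated runs
    os.symlink(target, alias)

with open(alias) as f:
    print(json.load(f))               # the same record, found by screen_name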