mirror of
https://github.com/balkian/bitter.git
synced 2024-12-22 00:18:12 +00:00
Changes to user and tweet search: Cache by default
* Improved printing of credential limits. * Tweet and user searches are now cached by default; the `--write` option has been replaced by `--dry_run`.
This commit is contained in:
parent
bba73091e4
commit
030c41b826
@ -1 +1 @@
|
|||||||
0.10.1
|
0.10.2
|
||||||
|
@ -93,6 +93,8 @@ def main(ctx, verbose, logging_level, config, credentials):
|
|||||||
@main.group(invoke_without_command=True)
|
@main.group(invoke_without_command=True)
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
def credentials(ctx):
|
def credentials(ctx):
|
||||||
|
if ctx.invoked_subcommand is not None:
|
||||||
|
return
|
||||||
wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
|
wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
|
||||||
for worker in wq.queue:
|
for worker in wq.queue:
|
||||||
print('#'*20)
|
print('#'*20)
|
||||||
@ -104,44 +106,47 @@ def credentials(ctx):
|
|||||||
|
|
||||||
|
|
||||||
@credentials.command('limits')
@click.option('--no_aggregate', is_flag=True, default=False,
              help=('Print limits from all workers. By default, limits are '
                    'aggregated (summed).'))
@click.option('--no_diff', is_flag=True, default=False,
              help=('Print all limits. By default, only limits that '
                    'have been consumed will be shown.'))
@click.argument('url', required=False)
@click.pass_context
def get_limits(ctx, no_aggregate, no_diff, url):
    """Print the rate limits of the configured credentials.

    Limits are collected from every worker in the queue and summed into a
    'global' entry per endpoint. If URL is given, only the endpoint whose
    key matches it exactly is reported.
    """
    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
    limits = {}
    if url:
        print('URL is: {}'.format(url))
    for worker in wq.queue:
        resp = worker.client.application.rate_limit_status()
        for urlimits in resp['resources'].values():
            # The loop variable must not be called `url`: that would shadow
            # the command argument and silently disable the filter.
            for endpoint, value in urlimits.items():
                if url and endpoint != url:
                    continue
                per_endpoint = limits.setdefault(endpoint, {})
                per_endpoint[worker.name] = value
                # Running total across workers, stored next to the
                # per-worker entries under the 'global' key.
                glob = per_endpoint.setdefault('global',
                                               {'limit': 0, 'remaining': 0})
                for k in ('limit', 'remaining'):
                    glob[k] += value[k]
    for endpoint, lims in limits.items():
        worker_list = lims.keys() if no_aggregate else ['global', ]

        # Print the endpoint header lazily so that endpoints with nothing
        # to report do not clutter the output.
        url_printed = False

        for worker in worker_list:
            vals = lims[worker]
            consumed = vals['limit'] - vals['remaining']
            if no_diff or consumed:
                if not url_printed:
                    print(endpoint)
                    url_printed = True
                print('\t', worker, ':')
                print('\t\t', vals)
|
|
||||||
|
|
||||||
@credentials.command('add')
|
@credentials.command('add')
|
||||||
@click.option('--consumer_key', default=None)
|
@click.option('--consumer_key', default=None)
|
||||||
@ -169,14 +174,14 @@ def tweet(ctx):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
@tweet.command('get')
@click.option('-d', '--dry_run', is_flag=True, default=False)
@click.option('-f', '--folder', default="tweets")
@click.option('-u', '--update', help="Update the file even if the tweet exists", is_flag=True, default=False)
@click.argument('tweetid')
@serialize
def get_tweet(tweetid, dry_run, folder, update):
    # Delegates to utils.download_tweet; --dry_run is the inverse of its
    # `write` argument, so results are written unless a dry run is requested.
    queue = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
    should_write = not dry_run
    yield from utils.download_tweet(queue, tweetid, should_write, folder, update)
|
||||||
|
|
||||||
@tweet.command('get_all', help='''Download tweets from a list of tweets in a CSV file.
|
@tweet.command('get_all', help='''Download tweets from a list of tweets in a CSV file.
|
||||||
The result is stored as individual json files in your folder of choice.''')
|
The result is stored as individual json files in your folder of choice.''')
|
||||||
@ -245,26 +250,13 @@ def list_users(ctx, db):
|
|||||||
|
|
||||||
@users.command('get')
@click.argument('user')
@click.option('-d', '--dry_run', is_flag=True, default=False)
@click.option('-f', '--folder', default="users")
@click.option('-u', '--update', help="Update the file even if the user exists", is_flag=True, default=False)
@serialize
def get_user(user, dry_run, folder, update):
    # Delegates to utils.download_user; --dry_run is the inverse of its
    # `write` argument, so results are written unless a dry run is requested.
    queue = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
    should_write = not dry_run
    yield from utils.download_user(queue, user, should_write, folder, update)
|
|
||||||
|
|
||||||
@users.command('get_all', help='''Download users from a list of user ids/screen names in a CSV file.
|
@users.command('get_all', help='''Download users from a list of user ids/screen names in a CSV file.
|
||||||
The result is stored as individual json files in your folder of choice.''')
|
The result is stored as individual json files in your folder of choice.''')
|
||||||
|
@ -277,8 +277,7 @@ def download_entry(wq, entry_id, dburi=None, recursive=False):
|
|||||||
download_user(wq, session, user, entry, recursive)
|
download_user(wq, session, user, entry, recursive)
|
||||||
session.close()
|
session.close()
|
||||||
|
|
||||||
|
def crawl_user(wq, session, user, entry=None, recursive=False, max_followers=50000):
|
||||||
def download_user(wq, session, user, entry=None, recursive=False, max_followers=50000):
|
|
||||||
|
|
||||||
total_followers = user.followers_count
|
total_followers = user.followers_count
|
||||||
|
|
||||||
@ -478,13 +477,23 @@ def download_tweet(wq, tweetid, write=True, folder="downloaded_tweets", update=F
|
|||||||
tweet = cached_id(tweetid, folder)
|
tweet = cached_id(tweetid, folder)
|
||||||
if update or not tweet:
|
if update or not tweet:
|
||||||
tweet = get_tweet(wq, tweetid)
|
tweet = get_tweet(wq, tweetid)
|
||||||
if write:
|
if write and update:
|
||||||
if tweet:
|
if tweet:
|
||||||
js = json.dumps(tweet)
|
js = json.dumps(tweet)
|
||||||
write_json(js, folder)
|
write_json(js, folder)
|
||||||
yield tweet
|
yield tweet
|
||||||
|
|
||||||
|
|
||||||
|
def download_user(wq, userid, write=True, folder="downloaded_users", update=False):
    """Yield the data of a single user, using the on-disk cache when possible.

    The cached copy in `folder` is used unless it is missing or `update` is
    set, in which case the user is fetched through `get_user`.

    :param wq: worker queue handed to `get_user` for the API request.
    :param userid: identifier used both for the cache lookup and the request.
    :param write: when true, store freshly fetched data in `folder`.
    :param folder: directory holding the cached JSON files.
    :param update: when true, fetch again even if a cached copy exists.
    """
    user = cached_id(userid, folder)
    if update or not user:
        user = get_user(wq, userid)
        # Persist every fresh download, not only forced updates: requiring
        # `update` here would leave the cache empty after a first lookup.
        if write and user:
            # NOTE(review): the alias is created by write_json; confirm it
            # copes with an alias that already exists when updating.
            write_json(user, folder, aliases=[user['screen_name'], ])
    yield user
|
||||||
|
|
||||||
|
|
||||||
def cached_id(oid, folder):
|
def cached_id(oid, folder):
|
||||||
tweet = None
|
tweet = None
|
||||||
file = os.path.join(folder, '%s.json' % oid)
|
file = os.path.join(folder, '%s.json' % oid)
|
||||||
@ -497,7 +506,7 @@ def cached_id(oid, folder):
|
|||||||
logger.error('Error getting cached version of {}: {}'.format(oid, ex))
|
logger.error('Error getting cached version of {}: {}'.format(oid, ex))
|
||||||
return tweet
|
return tweet
|
||||||
|
|
||||||
def write_json(js, folder, oid=None):
|
def write_json(js, folder, oid=None, aliases=[]):
|
||||||
if not oid:
|
if not oid:
|
||||||
oid = js['id']
|
oid = js['id']
|
||||||
file = id_file(oid, folder)
|
file = id_file(oid, folder)
|
||||||
@ -506,6 +515,8 @@ def write_json(js, folder, oid=None):
|
|||||||
with open(file, 'w') as f:
|
with open(file, 'w') as f:
|
||||||
json.dump(js, f)
|
json.dump(js, f)
|
||||||
logger.info('Written {} to file {}'.format(oid, file))
|
logger.info('Written {} to file {}'.format(oid, file))
|
||||||
|
for alias in aliases:
|
||||||
|
os.symlink('%s.json' % oid, id_file(alias, folder))
|
||||||
|
|
||||||
def id_file(oid, folder):
    """Return the path of the JSON file for `oid` inside `folder`."""
    filename = '%s.json' % oid
    return os.path.join(folder, filename)
|
||||||
|
Loading…
Reference in New Issue
Block a user