1
0
mirror of https://github.com/balkian/bitter.git synced 2024-12-22 08:28:12 +00:00

Fix bug in user_ids handling

Add number of failed downloads to the output.
Add flag to retry previously failed downloads.
This commit is contained in:
J. Fernando Sánchez 2018-09-16 12:20:41 +02:00
parent e6b08c4ffb
commit 02aec5eefa
3 changed files with 31 additions and 11 deletions

View File

@ -1 +1 @@
0.9.2 0.9.3

View File

@ -85,14 +85,15 @@ def get_tweet(tweetid, write, folder, update):
The result is stored as individual json files in your folder of choice.''') The result is stored as individual json files in your folder of choice.''')
@click.argument('tweetsfile', 'File with a list of tweets to look up') @click.argument('tweetsfile', 'File with a list of tweets to look up')
@click.option('-f', '--folder', default="tweets") @click.option('-f', '--folder', default="tweets")
@click.option('-u', '--update', is_flag=True, default=False, help='Download user even if it is already present. WARNING: it will overwrite existing files!') @click.option('-u', '--update', is_flag=True, default=False, help='Download tweet even if it is already present. WARNING: it will overwrite existing files!')
@click.option('-r', '--retry', is_flag=True, default=False, help='Retry failed downloads')
@click.option('-d', '--delimiter', default=",") @click.option('-d', '--delimiter', default=",")
@click.option('-h', '--header', help='Discard the first line (use it as a header)', @click.option('-h', '--header', help='Discard the first line (use it as a header)',
is_flag=True, default=False) is_flag=True, default=False)
@click.option('-q', '--quotechar', default='"') @click.option('-q', '--quotechar', default='"')
@click.option('-c', '--column', type=int, default=0) @click.option('-c', '--column', type=int, default=0)
@click.pass_context @click.pass_context
def get_tweets(ctx, tweetsfile, folder, update, delimiter, header, quotechar, column): def get_tweets(ctx, tweetsfile, folder, update, retry, delimiter, header, quotechar, column):
if update and not click.confirm('This may overwrite existing tweets. Continue?'): if update and not click.confirm('This may overwrite existing tweets. Continue?'):
click.echo('Cancelling') click.echo('Cancelling')
return return
@ -100,7 +101,7 @@ def get_tweets(ctx, tweetsfile, folder, update, delimiter, header, quotechar, co
for i in utils.download_file(wq, tweetsfile, folder, delimiter=delimiter, for i in utils.download_file(wq, tweetsfile, folder, delimiter=delimiter,
batch_method=utils.tweet_download_batch, batch_method=utils.tweet_download_batch,
header=header, quotechar=quotechar, header=header, quotechar=quotechar,
column=column, update=update): column=column, update=update, retry_failed=retry):
pass pass
@tweet.command('search') @tweet.command('search')
@ -163,20 +164,21 @@ def get_user(user, write, folder, update):
@click.argument('usersfile', 'File with a list of users to look up') @click.argument('usersfile', 'File with a list of users to look up')
@click.option('-f', '--folder', default="users") @click.option('-f', '--folder', default="users")
@click.option('-u', '--update', is_flag=True, default=False, help='Download user even if it is already present. WARNING: it will overwrite existing files!') @click.option('-u', '--update', is_flag=True, default=False, help='Download user even if it is already present. WARNING: it will overwrite existing files!')
@click.option('-r', '--retry', is_flag=True, default=False, help='Retry failed downloads')
@click.option('-d', '--delimiter', default=",") @click.option('-d', '--delimiter', default=",")
@click.option('-h', '--header', help='Discard the first line (use it as a header)', @click.option('-h', '--header', help='Discard the first line (use it as a header)',
is_flag=True, default=False) is_flag=True, default=False)
@click.option('-q', '--quotechar', default='"') @click.option('-q', '--quotechar', default='"')
@click.option('-c', '--column', type=int, default=0) @click.option('-c', '--column', type=int, default=0)
@click.pass_context @click.pass_context
def get_users(ctx, usersfile, folder, update, delimiter, header, quotechar, column): def get_users(ctx, usersfile, folder, update, retry, delimiter, header, quotechar, column):
if update and not click.confirm('This may overwrite existing users. Continue?'): if update and not click.confirm('This may overwrite existing users. Continue?'):
click.echo('Cancelling') click.echo('Cancelling')
return return
wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE) wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
for i in utils.download_file(wq, usersfile, folder, delimiter=delimiter, for i in utils.download_file(wq, usersfile, folder, delimiter=delimiter,
batch_method=utils.user_download_batch, batch_method=utils.user_download_batch,
update=update, update=update, retry_failed=retry,
header=header, quotechar=quotechar, header=header, quotechar=quotechar,
column=column): column=column):
pass pass

View File

@ -518,12 +518,22 @@ def user_download_batch(wq, batch):
user_ids.append(str(elem)) user_ids.append(str(elem))
except ValueError: except ValueError:
screen_names.append(elem.lower()) screen_names.append(elem.lower())
print('Downloading: {} - {}'.format(user_ids, screen_names)) args = {}
users = wq.users.lookup(user_id=",".join(user_ids), screen_name=",".join(screen_names)) if user_ids:
args['user_id'] = ','.join(user_ids)
if screen_names:
args['screen_name'] = ','.join(screen_names)
try:
users = wq.users.lookup(**args)
except TwitterHTTPError as ex:
if ex.e.code in (404,):
users = []
else:
raise
found_ids = [] found_ids = []
found_names = [] found_names = []
for user in users: for user in users:
uid = user['id'] uid = user['id_str']
if uid in user_ids: if uid in user_ids:
found_ids.append(uid) found_ids.append(uid)
yield (uid, user) yield (uid, user)
@ -552,6 +562,9 @@ def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fail
if obj: if obj:
try: try:
write_json(obj, folder=folder, oid=oid) write_json(obj, folder=folder, oid=oid)
failed = fail_file(oid, folder)
if os.path.exists(failed):
os.remove(failed)
yield 1 yield 1
except Exception as ex: except Exception as ex:
logger.error('%s: %s' % (oid, ex)) logger.error('%s: %s' % (oid, ex))
@ -565,8 +578,13 @@ def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fail
objects_to_crawl = filter(lambda x: x is not None, tqdm(parallel(filter_lines, lst), desc='Total objects')) objects_to_crawl = filter(lambda x: x is not None, tqdm(parallel(filter_lines, lst), desc='Total objects'))
batch_method = partial(batch_method, wq) batch_method = partial(batch_method, wq)
tweets = parallel(batch_method, objects_to_crawl, 100) objects = parallel(batch_method, objects_to_crawl, 100)
for res in tqdm(parallel(print_result, tweets), desc='Queried'): failed = 0
pbar = tqdm(parallel(print_result, objects), desc='Queried')
for res in pbar:
if res < 0:
failed += 1
pbar.set_description('Failed: %s. Queried' % failed, refresh=True)
yield res yield res