diff --git a/bitter/VERSION b/bitter/VERSION index 2003b63..965065d 100644 --- a/bitter/VERSION +++ b/bitter/VERSION @@ -1 +1 @@ -0.9.2 +0.9.3 diff --git a/bitter/cli.py b/bitter/cli.py index 493bcb9..49fb35f 100644 --- a/bitter/cli.py +++ b/bitter/cli.py @@ -85,14 +85,15 @@ def get_tweet(tweetid, write, folder, update): The result is stored as individual json files in your folder of choice.''') @click.argument('tweetsfile', 'File with a list of tweets to look up') @click.option('-f', '--folder', default="tweets") -@click.option('-u', '--update', is_flag=True, default=False, help='Download user even if it is already present. WARNING: it will overwrite existing files!') +@click.option('-u', '--update', is_flag=True, default=False, help='Download tweet even if it is already present. WARNING: it will overwrite existing files!') +@click.option('-r', '--retry', is_flag=True, default=False, help='Retry failed downloads') @click.option('-d', '--delimiter', default=",") @click.option('-h', '--header', help='Discard the first line (use it as a header)', is_flag=True, default=False) @click.option('-q', '--quotechar', default='"') @click.option('-c', '--column', type=int, default=0) @click.pass_context -def get_tweets(ctx, tweetsfile, folder, update, delimiter, header, quotechar, column): +def get_tweets(ctx, tweetsfile, folder, update, retry, delimiter, header, quotechar, column): if update and not click.confirm('This may overwrite existing tweets. \nContinue?'): click.echo('Cancelling') return @@ -100,7 +101,7 @@ def get_tweets(ctx, tweetsfile, folder, update, delimiter, header, quotechar, co for i in utils.download_file(wq, tweetsfile, folder, delimiter=delimiter, batch_method=utils.tweet_download_batch, header=header, quotechar=quotechar, - column=column, update=update): + column=column, update=update, retry_failed=retry): pass @tweet.command('search') @@ -163,20 +164,21 @@ def get_user(user, write, folder, update): @click.argument('usersfile', 'File with a list of users to look up') @click.option('-f', '--folder', default="users") @click.option('-u', '--update', is_flag=True, default=False, help='Download user even if it is already present. WARNING: it will overwrite existing files!') +@click.option('-r', '--retry', is_flag=True, default=False, help='Retry failed downloads') @click.option('-d', '--delimiter', default=",") @click.option('-h', '--header', help='Discard the first line (use it as a header)', is_flag=True, default=False) @click.option('-q', '--quotechar', default='"') @click.option('-c', '--column', type=int, default=0) @click.pass_context -def get_users(ctx, usersfile, folder, update, delimiter, header, quotechar, column): +def get_users(ctx, usersfile, folder, update, retry, delimiter, header, quotechar, column): if update and not click.confirm('This may overwrite existing users. \nContinue?'): click.echo('Cancelling') return wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE) for i in utils.download_file(wq, usersfile, folder, delimiter=delimiter, batch_method=utils.user_download_batch, - update=update, + update=update, retry_failed=retry, header=header, quotechar=quotechar, column=column): pass diff --git a/bitter/utils.py b/bitter/utils.py index 098296f..13eabe8 100644 --- a/bitter/utils.py +++ b/bitter/utils.py @@ -518,12 +518,22 @@ def user_download_batch(wq, batch): user_ids.append(str(elem)) except ValueError: screen_names.append(elem.lower()) - print('Downloading: {} - {}'.format(user_ids, screen_names)) - users = wq.users.lookup(user_id=",".join(user_ids), screen_name=",".join(screen_names)) + args = {} + if user_ids: + args['user_id'] = ','.join(user_ids) + if screen_names: + args['screen_name'] = ','.join(screen_names) + try: + users = wq.users.lookup(**args) + except TwitterHTTPError as ex: + if ex.e.code in (404,): + users = [] + else: + raise found_ids = [] found_names = [] for user in users: - uid = user['id'] + uid = user['id_str'] if uid in user_ids: found_ids.append(uid) yield (uid, user) @@ -552,6 +562,9 @@ def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fail if obj: try: write_json(obj, folder=folder, oid=oid) + failed = fail_file(oid, folder) + if os.path.exists(failed): + os.remove(failed) yield 1 except Exception as ex: logger.error('%s: %s' % (oid, ex)) @@ -565,8 +578,13 @@ def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fail objects_to_crawl = filter(lambda x: x is not None, tqdm(parallel(filter_lines, lst), desc='Total objects')) batch_method = partial(batch_method, wq) - tweets = parallel(batch_method, objects_to_crawl, 100) - for res in tqdm(parallel(print_result, tweets), desc='Queried'): + objects = parallel(batch_method, objects_to_crawl, 100) + failed = 0 + pbar = tqdm(parallel(print_result, objects), desc='Queried') + for res in pbar: + if res < 0: + failed += 1 + pbar.set_description('Failed: %s. Queried' % failed, refresh=True) yield res