Fix user_ids bug

Add number of failed downloads to the output.
Add flag to retry previously failed downloads.
master
J. Fernando Sánchez, 6 years ago
parent e6b08c4ffb
commit 02aec5eefa

@@ -1 +1 @@
-0.9.2
+0.9.3

@@ -85,14 +85,15 @@ def get_tweet(tweetid, write, folder, update):
     The result is stored as individual json files in your folder of choice.''')
 @click.argument('tweetsfile', 'File with a list of tweets to look up')
 @click.option('-f', '--folder', default="tweets")
-@click.option('-u', '--update', is_flag=True, default=False, help='Download user even if it is already present. WARNING: it will overwrite existing files!')
+@click.option('-u', '--update', is_flag=True, default=False, help='Download tweet even if it is already present. WARNING: it will overwrite existing files!')
+@click.option('-r', '--retry', is_flag=True, default=False, help='Retry failed downloads')
 @click.option('-d', '--delimiter', default=",")
 @click.option('-h', '--header', help='Discard the first line (use it as a header)',
               is_flag=True, default=False)
 @click.option('-q', '--quotechar', default='"')
 @click.option('-c', '--column', type=int, default=0)
 @click.pass_context
-def get_tweets(ctx, tweetsfile, folder, update, delimiter, header, quotechar, column):
+def get_tweets(ctx, tweetsfile, folder, update, retry, delimiter, header, quotechar, column):
     if update and not click.confirm('This may overwrite existing tweets. Continue?'):
         click.echo('Cancelling')
         return
@@ -100,7 +101,7 @@ def get_tweets(ctx, tweetsfile, folder, update, delimiter, header, quotechar, column):
     for i in utils.download_file(wq, tweetsfile, folder, delimiter=delimiter,
                                  batch_method=utils.tweet_download_batch,
                                  header=header, quotechar=quotechar,
-                                 column=column, update=update):
+                                 column=column, update=update, retry_failed=retry):
         pass
 
 @tweet.command('search')
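The new -r/--retry option follows the usual Click pattern: the flag arrives as a keyword argument of the command function and is forwarded to the helper under the name it expects, retry_failed. Below is a minimal, self-contained sketch of the same wiring; the fetch command and its download_file stub are illustrative stand-ins, not part of this repository:

import click


def download_file(source, retry_failed=False):
    # Stand-in for utils.download_file: a generator that yields one
    # result per downloaded object.
    click.echo('Downloading %s (retry_failed=%s)' % (source, retry_failed))
    yield 1


@click.command('fetch')
@click.argument('source')
@click.option('-r', '--retry', is_flag=True, default=False,
              help='Retry failed downloads')
def fetch(source, retry):
    # Click injects the flag as the `retry` parameter; it is forwarded
    # as retry_failed, the same way get_tweets does above.
    for _ in download_file(source, retry_failed=retry):
        pass


if __name__ == '__main__':
    fetch()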
@@ -163,20 +164,21 @@ def get_user(user, write, folder, update):
 @click.argument('usersfile', 'File with a list of users to look up')
 @click.option('-f', '--folder', default="users")
 @click.option('-u', '--update', is_flag=True, default=False, help='Download user even if it is already present. WARNING: it will overwrite existing files!')
+@click.option('-r', '--retry', is_flag=True, default=False, help='Retry failed downloads')
 @click.option('-d', '--delimiter', default=",")
 @click.option('-h', '--header', help='Discard the first line (use it as a header)',
               is_flag=True, default=False)
 @click.option('-q', '--quotechar', default='"')
 @click.option('-c', '--column', type=int, default=0)
 @click.pass_context
-def get_users(ctx, usersfile, folder, update, delimiter, header, quotechar, column):
+def get_users(ctx, usersfile, folder, update, retry, delimiter, header, quotechar, column):
     if update and not click.confirm('This may overwrite existing users. Continue?'):
         click.echo('Cancelling')
         return
     wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     for i in utils.download_file(wq, usersfile, folder, delimiter=delimiter,
                                  batch_method=utils.user_download_batch,
-                                 update=update,
+                                 update=update, retry_failed=retry,
                                  header=header, quotechar=quotechar,
                                  column=column):
         pass
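Both get_tweets and get_users guard the destructive --update flag behind click.confirm before touching any files. A runnable sketch of that guard in isolation, with a hypothetical overwrite_demo command:

import click


@click.command('overwrite-demo')
@click.option('-u', '--update', is_flag=True, default=False,
              help='Overwrite existing files')
def overwrite_demo(update):
    # Same pattern as get_tweets/get_users: ask before any destructive
    # action, and bail out early if the user declines.
    if update and not click.confirm('This may overwrite existing files. Continue?'):
        click.echo('Cancelling')
        return
    click.echo('Running (update=%s)' % update)


if __name__ == '__main__':
    overwrite_demo()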

@@ -518,12 +518,22 @@ def user_download_batch(wq, batch):
             user_ids.append(str(elem))
         except ValueError:
             screen_names.append(elem.lower())
-    print('Downloading: {} - {}'.format(user_ids, screen_names))
-    users = wq.users.lookup(user_id=",".join(user_ids), screen_name=",".join(screen_names))
+    args = {}
+    if user_ids:
+        args['user_id'] = ','.join(user_ids)
+    if screen_names:
+        args['screen_name'] = ','.join(screen_names)
+    try:
+        users = wq.users.lookup(**args)
+    except TwitterHTTPError as ex:
+        if ex.e.code in (404,):
+            users = []
+        else:
+            raise
     found_ids = []
     found_names = []
     for user in users:
-        uid = user['id']
+        uid = user['id_str']
         if uid in user_ids:
             found_ids.append(uid)
             yield (uid, user)
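This hunk is the user_ids bug from the commit title. The Twitter API returns a user's numeric id as an int under 'id' and as a string under 'id_str', while user_ids is built with str(elem), so the old membership test could never match. The rewrite also stops sending empty lookup parameters, since ','.join([]) yields '' rather than omitting the argument, and treats a 404 (no user found) as an empty result. A small self-contained check of the membership and parameter behaviour, with a made-up id:

user_ids = ['19063566']                        # built via str(elem) above
user = {'id': 19063566, 'id_str': '19063566'}  # shape of an API user object

assert user['id'] not in user_ids    # old code: int vs. str, never matched
assert user['id_str'] in user_ids    # fixed code: matches as intended

# Empty-parameter handling: only include the fields that have values.
screen_names = []
args = {}
if user_ids:
    args['user_id'] = ','.join(user_ids)
if screen_names:
    args['screen_name'] = ','.join(screen_names)
assert args == {'user_id': '19063566'}  # screen_name omitted entirely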
@@ -552,6 +562,9 @@ def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fail
         if obj:
             try:
                 write_json(obj, folder=folder, oid=oid)
+                failed = fail_file(oid, folder)
+                if os.path.exists(failed):
+                    os.remove(failed)
                 yield 1
             except Exception as ex:
                 logger.error('%s: %s' % (oid, ex))
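After a successful write_json, the commit now removes the object's failure marker, so an object that eventually succeeds is no longer picked up by --retry. fail_file is a helper from this repository that is not shown in the diff; the sketch below assumes it maps an object id to one empty marker file per id, which may not be its real naming scheme, and the mark_failed/clear_failed helpers are hypothetical:

import os


def fail_file(oid, folder):
    # Assumed layout: one marker file per failed object id.
    return os.path.join(folder, '%s.failed' % oid)


def mark_failed(oid, folder):
    # Hypothetical counterpart: record that a download failed.
    os.makedirs(folder, exist_ok=True)
    open(fail_file(oid, folder), 'a').close()


def clear_failed(oid, folder):
    # Same cleanup the hunk adds after write_json succeeds.
    failed = fail_file(oid, folder)
    if os.path.exists(failed):
        os.remove(failed)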
@@ -565,8 +578,13 @@ def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fail
     objects_to_crawl = filter(lambda x: x is not None, tqdm(parallel(filter_lines, lst), desc='Total objects'))
     batch_method = partial(batch_method, wq)
-    tweets = parallel(batch_method, objects_to_crawl, 100)
-    for res in tqdm(parallel(print_result, tweets), desc='Queried'):
+    objects = parallel(batch_method, objects_to_crawl, 100)
+    failed = 0
+    pbar = tqdm(parallel(print_result, objects), desc='Queried')
+    for res in pbar:
+        if res < 0:
+            failed += 1
+            pbar.set_description('Failed: %s. Queried' % failed, refresh=True)
         yield res
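The progress bar now doubles as a failure counter: each value from print_result is inspected, negative results are counted as failures, and tqdm's set_description repaints the running total in place. This is where the "number of failed downloads" from the commit message shows up. A minimal sketch assuming the same negative-means-failure convention:

from tqdm import tqdm


def results():
    # Stand-in for the stream produced by parallel(print_result, objects):
    # 1 for a successful download, a negative value for a failure.
    yield from (1, 1, -1, 1, -1)


failed = 0
pbar = tqdm(results(), desc='Queried')
for res in pbar:
    if res < 0:
        failed += 1
        pbar.set_description('Failed: %s. Queried' % failed, refresh=True)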
