mirror of
https://github.com/balkian/bitter.git
synced 2024-12-22 08:28:12 +00:00
Fix bug user_ids
Add number of failed downloads to the output. Add flag to retry previously failed downloads.
This commit is contained in:
parent
e6b08c4ffb
commit
02aec5eefa
@ -1 +1 @@
|
||||
0.9.2
|
||||
0.9.3
|
||||
|
@ -85,14 +85,15 @@ def get_tweet(tweetid, write, folder, update):
|
||||
The result is stored as individual json files in your folder of choice.''')
|
||||
@click.argument('tweetsfile', 'File with a list of tweets to look up')
|
||||
@click.option('-f', '--folder', default="tweets")
|
||||
@click.option('-u', '--update', is_flag=True, default=False, help='Download user even if it is already present. WARNING: it will overwrite existing files!')
|
||||
@click.option('-u', '--update', is_flag=True, default=False, help='Download tweet even if it is already present. WARNING: it will overwrite existing files!')
|
||||
@click.option('-r', '--retry', is_flag=True, default=False, help='Retry failed downloads')
|
||||
@click.option('-d', '--delimiter', default=",")
|
||||
@click.option('-h', '--header', help='Discard the first line (use it as a header)',
|
||||
is_flag=True, default=False)
|
||||
@click.option('-q', '--quotechar', default='"')
|
||||
@click.option('-c', '--column', type=int, default=0)
|
||||
@click.pass_context
|
||||
def get_tweets(ctx, tweetsfile, folder, update, delimiter, header, quotechar, column):
|
||||
def get_tweets(ctx, tweetsfile, folder, update, retry, delimiter, header, quotechar, column):
|
||||
if update and not click.confirm('This may overwrite existing tweets. Continue?'):
|
||||
click.echo('Cancelling')
|
||||
return
|
||||
@ -100,7 +101,7 @@ def get_tweets(ctx, tweetsfile, folder, update, delimiter, header, quotechar, co
|
||||
for i in utils.download_file(wq, tweetsfile, folder, delimiter=delimiter,
|
||||
batch_method=utils.tweet_download_batch,
|
||||
header=header, quotechar=quotechar,
|
||||
column=column, update=update):
|
||||
column=column, update=update, retry_failed=retry):
|
||||
pass
|
||||
|
||||
@tweet.command('search')
|
||||
@ -163,20 +164,21 @@ def get_user(user, write, folder, update):
|
||||
@click.argument('usersfile', 'File with a list of users to look up')
|
||||
@click.option('-f', '--folder', default="users")
|
||||
@click.option('-u', '--update', is_flag=True, default=False, help='Download user even if it is already present. WARNING: it will overwrite existing files!')
|
||||
@click.option('-r', '--retry', is_flag=True, default=False, help='Retry failed downloads')
|
||||
@click.option('-d', '--delimiter', default=",")
|
||||
@click.option('-h', '--header', help='Discard the first line (use it as a header)',
|
||||
is_flag=True, default=False)
|
||||
@click.option('-q', '--quotechar', default='"')
|
||||
@click.option('-c', '--column', type=int, default=0)
|
||||
@click.pass_context
|
||||
def get_users(ctx, usersfile, folder, update, delimiter, header, quotechar, column):
|
||||
def get_users(ctx, usersfile, folder, update, retry, delimiter, header, quotechar, column):
|
||||
if update and not click.confirm('This may overwrite existing users. Continue?'):
|
||||
click.echo('Cancelling')
|
||||
return
|
||||
wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
|
||||
for i in utils.download_file(wq, usersfile, folder, delimiter=delimiter,
|
||||
batch_method=utils.user_download_batch,
|
||||
update=update,
|
||||
update=update, retry_failed=retry,
|
||||
header=header, quotechar=quotechar,
|
||||
column=column):
|
||||
pass
|
||||
|
@ -518,12 +518,22 @@ def user_download_batch(wq, batch):
|
||||
user_ids.append(str(elem))
|
||||
except ValueError:
|
||||
screen_names.append(elem.lower())
|
||||
print('Downloading: {} - {}'.format(user_ids, screen_names))
|
||||
users = wq.users.lookup(user_id=",".join(user_ids), screen_name=",".join(screen_names))
|
||||
args = {}
|
||||
if user_ids:
|
||||
args['user_id'] = ','.join(user_ids)
|
||||
if screen_names:
|
||||
args['screen_name'] = ','.join(screen_names)
|
||||
try:
|
||||
users = wq.users.lookup(**args)
|
||||
except TwitterHTTPError as ex:
|
||||
if ex.e.code in (404,):
|
||||
users = []
|
||||
else:
|
||||
raise
|
||||
found_ids = []
|
||||
found_names = []
|
||||
for user in users:
|
||||
uid = user['id']
|
||||
uid = user['id_str']
|
||||
if uid in user_ids:
|
||||
found_ids.append(uid)
|
||||
yield (uid, user)
|
||||
@ -552,6 +562,9 @@ def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fail
|
||||
if obj:
|
||||
try:
|
||||
write_json(obj, folder=folder, oid=oid)
|
||||
failed = fail_file(oid, folder)
|
||||
if os.path.exists(failed):
|
||||
os.remove(failed)
|
||||
yield 1
|
||||
except Exception as ex:
|
||||
logger.error('%s: %s' % (oid, ex))
|
||||
@ -565,8 +578,13 @@ def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fail
|
||||
|
||||
objects_to_crawl = filter(lambda x: x is not None, tqdm(parallel(filter_lines, lst), desc='Total objects'))
|
||||
batch_method = partial(batch_method, wq)
|
||||
tweets = parallel(batch_method, objects_to_crawl, 100)
|
||||
for res in tqdm(parallel(print_result, tweets), desc='Queried'):
|
||||
objects = parallel(batch_method, objects_to_crawl, 100)
|
||||
failed = 0
|
||||
pbar = tqdm(parallel(print_result, objects), desc='Queried')
|
||||
for res in pbar:
|
||||
if res < 0:
|
||||
failed += 1
|
||||
pbar.set_description('Failed: %s. Queried' % failed, refresh=True)
|
||||
yield res
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user