mirror of
https://github.com/balkian/bitter.git
synced 2024-12-22 00:18:12 +00:00
Fix user_ids bug: match looked-up users by their string id (id_str)
Add number of failed downloads to the output. Add flag to retry previously failed downloads.
This commit is contained in:
parent
e6b08c4ffb
commit
02aec5eefa
@ -1 +1 @@
|
|||||||
0.9.2
|
0.9.3
|
||||||
|
@ -85,14 +85,15 @@ def get_tweet(tweetid, write, folder, update):
|
|||||||
The result is stored as individual json files in your folder of choice.''')
|
The result is stored as individual json files in your folder of choice.''')
|
||||||
@click.argument('tweetsfile', 'File with a list of tweets to look up')
|
@click.argument('tweetsfile', 'File with a list of tweets to look up')
|
||||||
@click.option('-f', '--folder', default="tweets")
|
@click.option('-f', '--folder', default="tweets")
|
||||||
@click.option('-u', '--update', is_flag=True, default=False, help='Download user even if it is already present. WARNING: it will overwrite existing files!')
|
@click.option('-u', '--update', is_flag=True, default=False, help='Download tweet even if it is already present. WARNING: it will overwrite existing files!')
|
||||||
|
@click.option('-r', '--retry', is_flag=True, default=False, help='Retry failed downloads')
|
||||||
@click.option('-d', '--delimiter', default=",")
|
@click.option('-d', '--delimiter', default=",")
|
||||||
@click.option('-h', '--header', help='Discard the first line (use it as a header)',
|
@click.option('-h', '--header', help='Discard the first line (use it as a header)',
|
||||||
is_flag=True, default=False)
|
is_flag=True, default=False)
|
||||||
@click.option('-q', '--quotechar', default='"')
|
@click.option('-q', '--quotechar', default='"')
|
||||||
@click.option('-c', '--column', type=int, default=0)
|
@click.option('-c', '--column', type=int, default=0)
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
def get_tweets(ctx, tweetsfile, folder, update, delimiter, header, quotechar, column):
|
def get_tweets(ctx, tweetsfile, folder, update, retry, delimiter, header, quotechar, column):
|
||||||
if update and not click.confirm('This may overwrite existing tweets. Continue?'):
|
if update and not click.confirm('This may overwrite existing tweets. Continue?'):
|
||||||
click.echo('Cancelling')
|
click.echo('Cancelling')
|
||||||
return
|
return
|
||||||
@ -100,7 +101,7 @@ def get_tweets(ctx, tweetsfile, folder, update, delimiter, header, quotechar, co
|
|||||||
for i in utils.download_file(wq, tweetsfile, folder, delimiter=delimiter,
|
for i in utils.download_file(wq, tweetsfile, folder, delimiter=delimiter,
|
||||||
batch_method=utils.tweet_download_batch,
|
batch_method=utils.tweet_download_batch,
|
||||||
header=header, quotechar=quotechar,
|
header=header, quotechar=quotechar,
|
||||||
column=column, update=update):
|
column=column, update=update, retry_failed=retry):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@tweet.command('search')
|
@tweet.command('search')
|
||||||
@ -163,20 +164,21 @@ def get_user(user, write, folder, update):
|
|||||||
@click.argument('usersfile', 'File with a list of users to look up')
|
@click.argument('usersfile', 'File with a list of users to look up')
|
||||||
@click.option('-f', '--folder', default="users")
|
@click.option('-f', '--folder', default="users")
|
||||||
@click.option('-u', '--update', is_flag=True, default=False, help='Download user even if it is already present. WARNING: it will overwrite existing files!')
|
@click.option('-u', '--update', is_flag=True, default=False, help='Download user even if it is already present. WARNING: it will overwrite existing files!')
|
||||||
|
@click.option('-r', '--retry', is_flag=True, default=False, help='Retry failed downloads')
|
||||||
@click.option('-d', '--delimiter', default=",")
|
@click.option('-d', '--delimiter', default=",")
|
||||||
@click.option('-h', '--header', help='Discard the first line (use it as a header)',
|
@click.option('-h', '--header', help='Discard the first line (use it as a header)',
|
||||||
is_flag=True, default=False)
|
is_flag=True, default=False)
|
||||||
@click.option('-q', '--quotechar', default='"')
|
@click.option('-q', '--quotechar', default='"')
|
||||||
@click.option('-c', '--column', type=int, default=0)
|
@click.option('-c', '--column', type=int, default=0)
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
def get_users(ctx, usersfile, folder, update, delimiter, header, quotechar, column):
|
def get_users(ctx, usersfile, folder, update, retry, delimiter, header, quotechar, column):
|
||||||
if update and not click.confirm('This may overwrite existing users. Continue?'):
|
if update and not click.confirm('This may overwrite existing users. Continue?'):
|
||||||
click.echo('Cancelling')
|
click.echo('Cancelling')
|
||||||
return
|
return
|
||||||
wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
|
wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
|
||||||
for i in utils.download_file(wq, usersfile, folder, delimiter=delimiter,
|
for i in utils.download_file(wq, usersfile, folder, delimiter=delimiter,
|
||||||
batch_method=utils.user_download_batch,
|
batch_method=utils.user_download_batch,
|
||||||
update=update,
|
update=update, retry_failed=retry,
|
||||||
header=header, quotechar=quotechar,
|
header=header, quotechar=quotechar,
|
||||||
column=column):
|
column=column):
|
||||||
pass
|
pass
|
||||||
|
@ -518,12 +518,22 @@ def user_download_batch(wq, batch):
|
|||||||
user_ids.append(str(elem))
|
user_ids.append(str(elem))
|
||||||
except ValueError:
|
except ValueError:
|
||||||
screen_names.append(elem.lower())
|
screen_names.append(elem.lower())
|
||||||
print('Downloading: {} - {}'.format(user_ids, screen_names))
|
args = {}
|
||||||
users = wq.users.lookup(user_id=",".join(user_ids), screen_name=",".join(screen_names))
|
if user_ids:
|
||||||
|
args['user_id'] = ','.join(user_ids)
|
||||||
|
if screen_names:
|
||||||
|
args['screen_name'] = ','.join(screen_names)
|
||||||
|
try:
|
||||||
|
users = wq.users.lookup(**args)
|
||||||
|
except TwitterHTTPError as ex:
|
||||||
|
if ex.e.code in (404,):
|
||||||
|
users = []
|
||||||
|
else:
|
||||||
|
raise
|
||||||
found_ids = []
|
found_ids = []
|
||||||
found_names = []
|
found_names = []
|
||||||
for user in users:
|
for user in users:
|
||||||
uid = user['id']
|
uid = user['id_str']
|
||||||
if uid in user_ids:
|
if uid in user_ids:
|
||||||
found_ids.append(uid)
|
found_ids.append(uid)
|
||||||
yield (uid, user)
|
yield (uid, user)
|
||||||
@ -552,6 +562,9 @@ def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fail
|
|||||||
if obj:
|
if obj:
|
||||||
try:
|
try:
|
||||||
write_json(obj, folder=folder, oid=oid)
|
write_json(obj, folder=folder, oid=oid)
|
||||||
|
failed = fail_file(oid, folder)
|
||||||
|
if os.path.exists(failed):
|
||||||
|
os.remove(failed)
|
||||||
yield 1
|
yield 1
|
||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
logger.error('%s: %s' % (oid, ex))
|
logger.error('%s: %s' % (oid, ex))
|
||||||
@ -565,8 +578,13 @@ def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fail
|
|||||||
|
|
||||||
objects_to_crawl = filter(lambda x: x is not None, tqdm(parallel(filter_lines, lst), desc='Total objects'))
|
objects_to_crawl = filter(lambda x: x is not None, tqdm(parallel(filter_lines, lst), desc='Total objects'))
|
||||||
batch_method = partial(batch_method, wq)
|
batch_method = partial(batch_method, wq)
|
||||||
tweets = parallel(batch_method, objects_to_crawl, 100)
|
objects = parallel(batch_method, objects_to_crawl, 100)
|
||||||
for res in tqdm(parallel(print_result, tweets), desc='Queried'):
|
failed = 0
|
||||||
|
pbar = tqdm(parallel(print_result, objects), desc='Queried')
|
||||||
|
for res in pbar:
|
||||||
|
if res < 0:
|
||||||
|
failed += 1
|
||||||
|
pbar.set_description('Failed: %s. Queried' % failed, refresh=True)
|
||||||
yield res
|
yield res
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user