Mirror of https://github.com/balkian/bitter.git (synced 2024-12-22 08:28:12 +00:00)

Commit 653487e2d7 (parent 02aec5eefa): Improve download_list
@@ -1 +1 @@
-0.9.3
+0.9.5
@@ -42,10 +42,58 @@ def main(ctx, verbose, logging_level, config, credentials):
     utils.copy_credentials_to_config(credentials, config)


-@main.group()
+@main.group(invoke_without_command=True)
 @click.pass_context
 def credentials(ctx):
-    pass
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
+    for worker in wq.queue:
+        print('#'*20)
+        try:
+            resp = worker.client.application.rate_limit_status()
+            print(worker.name)
+        except Exception as ex:
+            print('{}: AUTHENTICATION ERROR: {}'.format(worker.name, ex) )
+
+
+@credentials.command('limits')
+@click.option('--all', type=bool, default=False, required=False,
+              help=('Print all limits. By default, it only limits that '
+                    'have been consumed will be shown.'))
+@click.argument('url', required=False)
+@click.pass_context
+def get_limits(ctx, all, url):
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
+    total = {}
+    for worker in wq.queue:
+        resp = worker.client.application.rate_limit_status()
+        print('#'*20)
+        print(worker.name)
+        if url:
+            limit = 'NOT FOUND'
+            print('URL is: {}'.format(url))
+            cat = url.split('/')[1]
+            if cat in resp['resources']:
+                limit = resp['resources'][cat].get(url, None) or resp['resources'][cat]
+            else:
+                print('Cat {} not found'.format(cat))
+                continue
+            for k in limit:
+                total[k] = total.get(k, 0) + limit[k]
+            print('{}: {}'.format(url, limit))
+            continue
+        nres = {}
+        if not all:
+            for res, urls in resp['resources'].items():
+                nurls = {}
+                for u, limits in urls.items():
+                    if limits['limit'] != limits['remaining']:
+                        nurls[u] = limits
+                if nurls:
+                    nres[res] = nurls
+            resp = nres
+        print(json.dumps(resp, indent=2))
+    if url:
+        print('Total for {}: {}'.format(url, total))

 @credentials.command('add')
 @click.option('--consumer_key', default=None)
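A quick way to exercise the reworked credentials group is click's test runner. This is a hypothetical smoke test, not part of the commit; the bitter.cli module path, the presence of a valid config/credentials file, and the example endpoint URL are all assumptions.

    # Hypothetical smoke test; assumes credentials have already been configured.
    from click.testing import CliRunner
    from bitter.cli import main  # module path assumed; the file header is not shown above

    runner = CliRunner()
    # With invoke_without_command=True, the bare group now prints each worker's
    # rate-limit status (or an authentication error) instead of doing nothing.
    print(runner.invoke(main, ['credentials']).output)
    # The new 'limits' subcommand: consumed limits only, or a single endpoint by URL.
    print(runner.invoke(main, ['credentials', 'limits']).output)
    print(runner.invoke(main, ['credentials', 'limits', '/statuses/lookup']).output)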
@@ -98,11 +146,17 @@ def get_tweets(ctx, tweetsfile, folder, update, retry, delimiter, header, quotec
         click.echo('Cancelling')
         return
     wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
-    for i in utils.download_file(wq, tweetsfile, folder, delimiter=delimiter,
-                                 batch_method=utils.tweet_download_batch,
-                                 header=header, quotechar=quotechar,
-                                 column=column, update=update, retry_failed=retry):
-        pass
+    status = tqdm('Queried')
+    failed = 0
+    for tid, obj in utils.download_file(wq, tweetsfile, folder, delimiter=delimiter,
+                                        batch_method=utils.tweet_download_batch,
+                                        header=header, quotechar=quotechar,
+                                        column=column, update=update, retry_failed=retry):
+        status.update(1)
+        if not obj:
+            failed += 1
+            status.set_description('Failed: %s. Queried' % failed, refresh=True)
+

 @tweet.command('search')
 @click.argument('query')
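The progress reporting added to get_tweets is tqdm's manual-update pattern. In isolation it looks roughly like the sketch below; this is not part of the commit, the fake_download generator is a made-up stand-in, and the description is passed via the desc keyword rather than as the first positional argument.

    from tqdm import tqdm

    def fake_download():                       # hypothetical stand-in for download_file()
        yield '123', {'text': 'hello'}
        yield '456', None                      # an object that could not be fetched

    status = tqdm(desc='Queried')              # manual counter, no iterable attached
    failed = 0
    for tid, obj in fake_download():
        status.update(1)                       # one tick per queried object
        if not obj:
            failed += 1
            status.set_description('Failed: %s. Queried' % failed, refresh=True)
    status.close()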
@@ -383,34 +437,6 @@ def reset_extractor(ctx):
     session = make_session(db)
     session.query(ExtractorEntry).filter(ExtractorEntry.pending==True).update({'pending':False})

-@main.command('limits')
-@click.argument('url', required=False)
-@click.pass_context
-def get_limits(ctx, url):
-    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
-    total = {}
-    for worker in wq.queue:
-        resp = worker.client.application.rate_limit_status()
-        print('#'*20)
-        print(worker.name)
-        if url:
-            limit = 'NOT FOUND'
-            print('URL is: {}'.format(url))
-            cat = url.split('/')[1]
-            if cat in resp['resources']:
-                limit = resp['resources'][cat].get(url, None) or resp['resources'][cat]
-            else:
-                print('Cat {} not found'.format(cat))
-                continue
-            for k in limit:
-                total[k] = total.get(k, 0) + limit[k]
-            print('{}: {}'.format(url, limit))
-        else:
-            print(json.dumps(resp, indent=2))
-    if url:
-        print('Total for {}: {}'.format(url, total))
-
-

 @main.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=False),
               help='''Issue a call to an endpoint of the Twitter API.''')
bitter/utils.py (129 lines changed)
@@ -13,6 +13,11 @@ import sqlalchemy
 import os
 import multiprocessing
 from multiprocessing.pool import ThreadPool
+from multiprocessing import Queue
+
+import queue
+import threading
+from select import select

 from functools import partial
@@ -22,6 +27,7 @@ from itertools import islice, chain
 from contextlib import contextmanager

 from collections import Counter
+from random import choice

 from builtins import map, filter
@@ -53,7 +59,7 @@ def chunk(iterable, n):
 def parallel(func, source, chunksize=1, numcpus=multiprocessing.cpu_count()):
     source = chunk(source, chunksize)
     p = ThreadPool(numcpus*2)
-    results = p.imap_unordered(func, source, chunksize=int(1000/numcpus))
+    results = p.imap_unordered(func, source)
     for i in chain.from_iterable(results):
         yield i
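For reference, parallel() hands whole chunks of the source to the worker function over a thread pool and flattens the per-chunk results back into one stream. A minimal sketch of using it, with a made-up payload function (not part of the commit):

    # Hypothetical payload: receives one chunk (an iterable of items) and must
    # return an iterable of results, which parallel() then flattens.
    def square_chunk(items):
        return [i * i for i in items]

    # Thread-pooled; output order is not guaranteed because of imap_unordered.
    for value in parallel(square_chunk, range(10), chunksize=2):
        print(value)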
@@ -507,7 +513,8 @@ def id_failed(oid, folder):

 def tweet_download_batch(wq, batch):
     tweets = wq.statuses.lookup(_id=",".join(batch), map=True)['id']
-    return tweets.items()
+    for tid, tweet in tweets.items():
+        yield tid, tweet

 def user_download_batch(wq, batch):
     screen_names = []
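tweet_download_batch is now a generator, so the batch_method contract (as far as this diff shows) is: take the client queue and a batch of string ids, and yield (id, object) pairs, with a falsy object for ids that could not be resolved. A hypothetical batch method that follows the same contract without touching the Twitter API:

    def local_download_batch(wq, batch):
        # Hypothetical batch_method: resolves ids from an in-memory dict instead
        # of wq.statuses.lookup, but keeps the same (id, object) protocol.
        known = {'123': {'text': 'hello'}, '456': {'text': 'world'}}
        for oid in batch:
            yield oid, known.get(oid)   # None when the id is unknown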
@@ -547,45 +554,81 @@ def user_download_batch(wq, batch):
         yield (name, None)


-def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fails=True,
+def dump_result(oid, obj, folder, ignore_fails=True):
+    if obj:
+        try:
+            write_json(obj, folder=folder, oid=oid)
+            failed = fail_file(oid, folder)
+            if os.path.exists(failed):
+                os.remove(failed)
+        except Exception as ex:
+            logger.error('%s: %s' % (oid, ex))
+            if not ignore_fails:
+                raise
+    else:
+        logger.info('Object not recovered: {}'.format(oid))
+        with open(fail_file(oid, folder), 'w') as f:
+            print('Object not found', file=f)
+
+
+def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fails=False,
                   batch_method=tweet_download_batch):
-    def filter_lines(line):
-        # print('Checking {}'.format(line))
-        oid = line[0]
-        if (cached_id(oid, folder) and not update) or (id_failed(oid, folder) and not retry_failed):
-            yield None
-        else:
-            yield str(oid)
-
-    def print_result(res):
-        for oid, obj in res:
-            if obj:
-                try:
-                    write_json(obj, folder=folder, oid=oid)
-                    failed = fail_file(oid, folder)
-                    if os.path.exists(failed):
-                        os.remove(failed)
-                    yield 1
-                except Exception as ex:
-                    logger.error('%s: %s' % (oid, ex))
-                    if not ignore_fails:
-                        raise
-            else:
-                logger.info('Object not recovered: {}'.format(oid))
-                with open(fail_file(oid, folder), 'w') as f:
-                    print('Object not found', file=f)
-                yield -1
-
-    objects_to_crawl = filter(lambda x: x is not None, tqdm(parallel(filter_lines, lst), desc='Total objects'))
-    batch_method = partial(batch_method, wq)
-    objects = parallel(batch_method, objects_to_crawl, 100)
-    failed = 0
-    pbar = tqdm(parallel(print_result, objects), desc='Queried')
-    for res in pbar:
-        if res < 0:
-            failed += 1
-            pbar.set_description('Failed: %s. Queried' % failed, refresh=True)
-        yield res
+    done = Queue()
+    down = Queue()
+
+    def filter_list(lst, done, down):
+        print('filtering')
+        for oid in lst:
+            # print('Checking {}'.format(line))
+            cached = cached_id(oid, folder)
+            if (cached and not update):
+                done.put((oid, cached))
+            elif (id_failed(oid, folder) and not retry_failed):
+                done.put((oid, None))
+            else:
+                down.put(oid)
+        down.put(None)
+
+    def download_results(batch_method, down, done):
+        def gen():
+            while True:
+                r = down.get()
+                if not r:
+                    return
+                yield r
+
+        for t in parallel(batch_method, gen(), 100):
+            done.put(t)
+
+    def batch(*args, **kwargs):
+        return batch_method(wq, *args, **kwargs)
+
+    tc = threading.Thread(target=filter_list, args=(lst, done, down), daemon=True)
+    tc.start()
+    td = threading.Thread(target=download_results, args=(batch, down, done), daemon=True)
+    td.start()
+
+    def check_threads(ts, done):
+        for t in ts:
+            t.join()
+        done.put(None)
+
+    wait = threading.Thread(target=check_threads, args=([tc, td], done), daemon=True)
+    wait.start()
+
+    while True:
+        rec = done.get()
+
+        if rec is None:
+            break
+
+        oid, obj = rec
+        dump_result(oid, obj, folder, ignore_fails)
+        yield rec
+
+    wait.join()


 def download_file(wq, csvfile, folder, column=0, delimiter=',',
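The rewritten download_list is a producer/consumer pipeline: one thread filters ids into the down queue (sending cached or previously failed ids straight to done), a second thread batches the remaining ids through batch_method via parallel() and pushes results to done, and the main generator drains done, persisting each record with dump_result. A stripped-down sketch of the same pattern follows; the names are hypothetical and there are no Twitter calls.

    import threading
    from multiprocessing import Queue

    def pipeline(items, fetch):
        """Sketch of download_list's queue/thread layout (a simplification, not the real code)."""
        down, done = Queue(), Queue()

        def producer():
            for item in items:
                down.put(item)          # in the real code, cached/failed ids skip this queue
            down.put(None)              # sentinel: nothing left to download

        def consumer():
            while True:
                item = down.get()
                if item is None:
                    break
                done.put((item, fetch(item)))
            done.put(None)              # sentinel: all results delivered

        threads = [threading.Thread(target=producer, daemon=True),
                   threading.Thread(target=consumer, daemon=True)]
        for t in threads:
            t.start()

        while True:
            rec = done.get()
            if rec is None:
                break
            yield rec                   # the caller persists results, as dump_result does

        for t in threads:
            t.join()

    # Hypothetical usage:
    for oid, obj in pipeline(['1', '2', '3'], fetch=lambda i: {'id': i}):
        print(oid, obj)

Using None as the end-of-work sentinel mirrors the diff above, where filter_list ends with down.put(None) and check_threads signals completion with done.put(None).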
@@ -595,8 +638,14 @@ def download_file(wq, csvfile, folder, column=0, delimiter=',',
         csvreader = csv.reader(f, delimiter=str(delimiter), quotechar=str(quotechar))
         if header:
             next(csvreader)
-        tweets = map(lambda row: row[0].strip(), csvreader)
-        for res in download_list(wq, tweets, folder, batch_method=batch_method,
+
+        def reader(r):
+            for row in csvreader:
+                if len(row) > column:
+                    yield row[column].strip()
+
+        for res in download_list(wq, reader(csvreader), folder, batch_method=batch_method,
                                  **kwargs):
            yield res
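End to end, download_file now yields an (id, object) pair for every row whose id column is present, instead of assuming the id is always in column 0. A hypothetical driver outside the CLI, with assumed module paths, file names and CSV layout:

    # Hypothetical driver, not part of this commit. The module aliases mirror the
    # CLI code above; the config file, CSV file and output folder are assumptions.
    from bitter import utils, crawlers
    from bitter import config as bconf

    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
    for tid, obj in utils.download_file(wq, 'tweets.csv', 'downloaded/',
                                        batch_method=utils.tweet_download_batch,
                                        column=0, header=False):
        if not obj:
            print('Could not fetch {}'.format(tid))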