Improve download_list

master 0.9.5
J. Fernando Sánchez 5 years ago
parent 02aec5eefa
commit 653487e2d7

@@ -1 +1 @@
-0.9.3
+0.9.5

@@ -42,10 +42,58 @@ def main(ctx, verbose, logging_level, config, credentials):
     utils.copy_credentials_to_config(credentials, config)

-@main.group()
+@main.group(invoke_without_command=True)
 @click.pass_context
 def credentials(ctx):
-    pass
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
+    for worker in wq.queue:
+        print('#'*20)
+        try:
+            resp = worker.client.application.rate_limit_status()
+            print(worker.name)
+        except Exception as ex:
+            print('{}: AUTHENTICATION ERROR: {}'.format(worker.name, ex))
+
+
+@credentials.command('limits')
+@click.option('--all', type=bool, default=False, required=False,
+              help=('Print all limits. By default, only limits that '
+                    'have been consumed will be shown.'))
+@click.argument('url', required=False)
+@click.pass_context
+def get_limits(ctx, all, url):
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
+    total = {}
+    for worker in wq.queue:
+        resp = worker.client.application.rate_limit_status()
+        print('#'*20)
+        print(worker.name)
+        if url:
+            limit = 'NOT FOUND'
+            print('URL is: {}'.format(url))
+            cat = url.split('/')[1]
+            if cat in resp['resources']:
+                limit = resp['resources'][cat].get(url, None) or resp['resources'][cat]
+            else:
+                print('Cat {} not found'.format(cat))
+                continue
+            for k in limit:
+                total[k] = total.get(k, 0) + limit[k]
+            print('{}: {}'.format(url, limit))
+            continue
+        nres = {}
+        if not all:
+            for res, urls in resp['resources'].items():
+                nurls = {}
+                for u, limits in urls.items():
+                    if limits['limit'] != limits['remaining']:
+                        nurls[u] = limits
+                if nurls:
+                    nres[res] = nurls
+            resp = nres
+        print(json.dumps(resp, indent=2))
+    if url:
+        print('Total for {}: {}'.format(url, total))
+
+
 @credentials.command('add')
 @click.option('--consumer_key', default=None)
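For reference, `application.rate_limit_status()` returns limits nested as resources → family → endpoint, each endpoint carrying `limit`, `remaining` and `reset` fields. A minimal sketch of the consumed-only filter used by `get_limits`, run against a hand-written sample payload (the numbers are hypothetical):

    # Hand-written sample of the rate_limit_status() payload shape
    # (resources -> family -> endpoint); the values are made up.
    sample = {
        'resources': {
            'statuses': {
                '/statuses/lookup': {'limit': 900, 'remaining': 900, 'reset': 0},
                '/statuses/show/:id': {'limit': 900, 'remaining': 850, 'reset': 0},
            },
        },
    }

    def consumed_only(resp):
        # Same filter as get_limits: keep only endpoints with used-up quota.
        nres = {}
        for res, urls in resp['resources'].items():
            nurls = {u: l for u, l in urls.items()
                     if l['limit'] != l['remaining']}
            if nurls:
                nres[res] = nurls
        return nres

    print(consumed_only(sample))
    # {'statuses': {'/statuses/show/:id': {'limit': 900, 'remaining': 850, 'reset': 0}}}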
@@ -98,11 +146,17 @@ def get_tweets(ctx, tweetsfile, folder, update, retry, delimiter, header, quotec
         click.echo('Cancelling')
         return
     wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
-    for i in utils.download_file(wq, tweetsfile, folder, delimiter=delimiter,
-                                 batch_method=utils.tweet_download_batch,
-                                 header=header, quotechar=quotechar,
-                                 column=column, update=update, retry_failed=retry):
-        pass
+    status = tqdm(desc='Queried')
+    failed = 0
+    for tid, obj in utils.download_file(wq, tweetsfile, folder, delimiter=delimiter,
+                                        batch_method=utils.tweet_download_batch,
+                                        header=header, quotechar=quotechar,
+                                        column=column, update=update, retry_failed=retry):
+        status.update(1)
+        if not obj:
+            failed += 1
+            status.set_description('Failed: %s. Queried' % failed, refresh=True)

 @tweet.command('search')
 @click.argument('query')
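The progress bar here acts as a manual counter: no iterable is passed, each downloaded object bumps the count with `update(1)`, and the failure tally is surfaced through the bar's description. A self-contained sketch with stand-in results:

    from tqdm import tqdm

    status = tqdm(desc='Queried')   # no iterable: a pure manual counter
    failed = 0
    for tid, obj in [(1, {'id': 1}), (2, None), (3, {'id': 3})]:  # stand-in results
        status.update(1)
        if not obj:
            failed += 1
            status.set_description('Failed: %s. Queried' % failed, refresh=True)
    status.close()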
@@ -383,34 +437,6 @@ def reset_extractor(ctx):
     session = make_session(db)
     session.query(ExtractorEntry).filter(ExtractorEntry.pending==True).update({'pending':False})

-@main.command('limits')
-@click.argument('url', required=False)
-@click.pass_context
-def get_limits(ctx, url):
-    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
-    total = {}
-    for worker in wq.queue:
-        resp = worker.client.application.rate_limit_status()
-        print('#'*20)
-        print(worker.name)
-        if url:
-            limit = 'NOT FOUND'
-            print('URL is: {}'.format(url))
-            cat = url.split('/')[1]
-            if cat in resp['resources']:
-                limit = resp['resources'][cat].get(url, None) or resp['resources'][cat]
-            else:
-                print('Cat {} not found'.format(cat))
-                continue
-            for k in limit:
-                total[k] = total.get(k, 0) + limit[k]
-            print('{}: {}'.format(url, limit))
-        else:
-            print(json.dumps(resp, indent=2))
-    if url:
-        print('Total for {}: {}'.format(url, total))
-
 @main.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=False),
               help='''Issue a call to an endpoint of the Twitter API.''')

@@ -13,6 +13,11 @@ import sqlalchemy
 import os
 import multiprocessing
 from multiprocessing.pool import ThreadPool
+from multiprocessing import Queue
+import queue
+import threading
+from select import select
+
 from functools import partial
@@ -22,6 +27,7 @@ from itertools import islice, chain
 from contextlib import contextmanager
 from collections import Counter
+from random import choice
 from builtins import map, filter
@@ -53,7 +59,7 @@ def chunk(iterable, n):
 def parallel(func, source, chunksize=1, numcpus=multiprocessing.cpu_count()):
     source = chunk(source, chunksize)
     p = ThreadPool(numcpus*2)
-    results = p.imap_unordered(func, source, chunksize=int(1000/numcpus))
+    results = p.imap_unordered(func, source)
     for i in chain.from_iterable(results):
         yield i
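Dropping the `chunksize` argument plausibly matters because `download_list` now feeds `parallel` from a live, queue-backed generator (see the rewrite below): with `imap_unordered(..., chunksize=N)` the pool gathers N items from the source before dispatching a single task, which stalls a streaming source. That motivation is an inference, not stated in the commit; a toy illustration:

    from multiprocessing.pool import ThreadPool
    import time

    def slow_source():
        for i in range(5):
            time.sleep(0.1)       # stands in for blocking on a queue
            yield [i]             # parallel() feeds chunks (lists) like this

    def work(chunk):
        return [x * 2 for x in chunk]

    pool = ThreadPool(2)
    # With the default chunksize=1, results stream out as items arrive; a
    # large chunksize would buffer many chunks before the first task runs.
    for r in pool.imap_unordered(work, slow_source()):
        print(r)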
@@ -507,7 +513,8 @@ def id_failed(oid, folder):
 def tweet_download_batch(wq, batch):
     tweets = wq.statuses.lookup(_id=",".join(batch), map=True)['id']
-    return tweets.items()
+    for tid, tweet in tweets.items():
+        yield tid, tweet

 def user_download_batch(wq, batch):
     screen_names = []
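With `map=True`, `statuses/lookup` returns an entry for every requested id, mapping deleted or protected tweets to null, so yielding `(id, tweet_or_None)` pairs lets the caller count failures per id. A sketch of that contract, with a stand-in for the API call:

    # fake_lookup stands in for wq.statuses.lookup(_id=..., map=True)['id'].
    def fake_lookup(batch):
        found = {'123': {'id': 123, 'text': 'hi'}}
        return {tid: found.get(tid) for tid in batch}

    def batch_sketch(batch):
        # One pair per requested id, None for tweets that could not be fetched.
        for tid, tweet in fake_lookup(batch).items():
            yield tid, tweet

    print(list(batch_sketch(['123', '456'])))
    # [('123', {'id': 123, 'text': 'hi'}), ('456', None)]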
@@ -547,45 +554,81 @@ def user_download_batch(wq, batch):
             yield (name, None)

-def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fails=True,
+def dump_result(oid, obj, folder, ignore_fails=True):
+    if obj:
+        try:
+            write_json(obj, folder=folder, oid=oid)
+            failed = fail_file(oid, folder)
+            if os.path.exists(failed):
+                os.remove(failed)
+        except Exception as ex:
+            logger.error('%s: %s' % (oid, ex))
+            if not ignore_fails:
+                raise
+    else:
+        logger.info('Object not recovered: {}'.format(oid))
+        with open(fail_file(oid, folder), 'w') as f:
+            print('Object not found', file=f)
+
+
+def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fails=False,
                   batch_method=tweet_download_batch):
-    def filter_lines(line):
-        # print('Checking {}'.format(line))
-        oid = line[0]
-        if (cached_id(oid, folder) and not update) or (id_failed(oid, folder) and not retry_failed):
-            yield None
-        else:
-            yield str(oid)
-
-    def print_result(res):
-        for oid, obj in res:
-            if obj:
-                try:
-                    write_json(obj, folder=folder, oid=oid)
-                    failed = fail_file(oid, folder)
-                    if os.path.exists(failed):
-                        os.remove(failed)
-                    yield 1
-                except Exception as ex:
-                    logger.error('%s: %s' % (oid, ex))
-                    if not ignore_fails:
-                        raise
-            else:
-                logger.info('Object not recovered: {}'.format(oid))
-                with open(fail_file(oid, folder), 'w') as f:
-                    print('Object not found', file=f)
-                yield -1
-
-    objects_to_crawl = filter(lambda x: x is not None, tqdm(parallel(filter_lines, lst), desc='Total objects'))
-    batch_method = partial(batch_method, wq)
-    objects = parallel(batch_method, objects_to_crawl, 100)
-
-    failed = 0
-    pbar = tqdm(parallel(print_result, objects), desc='Queried')
-    for res in pbar:
-        if res < 0:
-            failed += 1
-            pbar.set_description('Failed: %s. Queried' % failed, refresh=True)
-        yield res
+
+    done = Queue()
+    down = Queue()
+
+    def filter_list(lst, done, down):
+        for oid in lst:
+            cached = cached_id(oid, folder)
+            if cached and not update:
+                done.put((oid, cached))
+            elif id_failed(oid, folder) and not retry_failed:
+                done.put((oid, None))
+            else:
+                down.put(oid)
+        down.put(None)
+
+    def download_results(batch_method, down, done):
+        def gen():
+            while True:
+                r = down.get()
+                if r is None:
+                    return
+                yield r
+
+        for t in parallel(batch_method, gen(), 100):
+            done.put(t)
+
+    def batch(*args, **kwargs):
+        return batch_method(wq, *args, **kwargs)
+
+    tc = threading.Thread(target=filter_list, args=(lst, done, down), daemon=True)
+    tc.start()
+    td = threading.Thread(target=download_results, args=(batch, down, done), daemon=True)
+    td.start()
+
+    def check_threads(ts, done):
+        for t in ts:
+            t.join()
+        done.put(None)
+
+    wait = threading.Thread(target=check_threads, args=([tc, td], done), daemon=True)
+    wait.start()
+
+    while True:
+        rec = done.get()
+        if rec is None:
+            break
+        oid, obj = rec
+        dump_result(oid, obj, folder, ignore_fails)
+        yield rec
+
+    wait.join()

 def download_file(wq, csvfile, folder, column=0, delimiter=',',
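The rewritten `download_list` is a three-thread pipeline: `filter_list` routes cached and previously-failed ids straight to the `done` queue and fresh ids to `down`; `download_results` drains `down` in batches; and a watcher thread joins both and pushes a `None` sentinel so the consuming loop knows when to stop. A condensed, self-contained sketch of the same pattern, with hypothetical `is_cached`/`fetch_batch` callables:

    import threading
    from queue import Queue   # thread-safe; the commit uses multiprocessing.Queue,
                              # which behaves the same way across threads

    def pipeline(items, is_cached, fetch_batch):
        done, down = Queue(), Queue()

        def producer():
            # Route known results straight to `done`; queue the rest.
            for oid in items:
                if is_cached(oid):
                    done.put((oid, 'cached'))
                else:
                    down.put(oid)
            down.put(None)                       # end-of-input sentinel

        def downloader():
            def pending():
                while True:
                    r = down.get()
                    if r is None:
                        return
                    yield r
            # fetch_batch consumes ids lazily and yields (oid, obj) pairs.
            for pair in fetch_batch(pending()):
                done.put(pair)

        workers = [threading.Thread(target=producer, daemon=True),
                   threading.Thread(target=downloader, daemon=True)]
        for w in workers:
            w.start()

        def watchdog():
            for w in workers:
                w.join()
            done.put(None)                       # end-of-output sentinel

        threading.Thread(target=watchdog, daemon=True).start()

        while True:
            rec = done.get()
            if rec is None:
                break
            yield rec

    # Example run with stand-in callables:
    hits = list(pipeline(['1', '2', '3'],
                         is_cached=lambda oid: oid == '2',
                         fetch_batch=lambda ids: ((i, {'id': i}) for i in ids)))

The in-band sentinels mean no stage has to poll another thread's liveness: each producer announces completion through the same queue it writes results to.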
@@ -595,8 +638,14 @@ def download_file(wq, csvfile, folder, column=0, delimiter=',',
     csvreader = csv.reader(f, delimiter=str(delimiter), quotechar=str(quotechar))
     if header:
         next(csvreader)
-    tweets = map(lambda row: row[0].strip(), csvreader)
-    for res in download_list(wq, tweets, folder, batch_method=batch_method,
+
+    def reader(r):
+        for row in r:
+            if len(row) > column:
+                yield row[column].strip()
+
+    for res in download_list(wq, reader(csvreader), folder, batch_method=batch_method,
                               **kwargs):
         yield res
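The old `map(lambda row: row[0].strip(), ...)` ignored the `column` argument and raised `IndexError` on blank rows; the `reader` generator fixes both. A quick check of that behaviour, with a made-up sample CSV:

    import csv
    import io

    def reader(rows, column=0):
        # Mirror of the new reader(): pick one column, skip short/blank rows.
        for row in rows:
            if len(row) > column:
                yield row[column].strip()

    rows = csv.reader(io.StringIO('id,name\n123 , alice\n\n456,bob\n'))
    next(rows)                                   # skip the header
    print(list(reader(rows, column=0)))          # ['123', '456']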
