Mirror of https://github.com/balkian/bitter.git

Improve download_list

J. Fernando Sánchez 2019-04-30 19:15:15 +02:00
parent 02aec5eefa
commit 653487e2d7
4 changed files with 164 additions and 89 deletions

View File

@@ -1 +1 @@
-0.9.3
+0.9.5

View File

@@ -42,10 +42,58 @@ def main(ctx, verbose, logging_level, config, credentials):
     utils.copy_credentials_to_config(credentials, config)

-@main.group()
+@main.group(invoke_without_command=True)
 @click.pass_context
 def credentials(ctx):
-    pass
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
+    for worker in wq.queue:
+        print('#'*20)
+        try:
+            resp = worker.client.application.rate_limit_status()
+            print(worker.name)
+        except Exception as ex:
+            print('{}: AUTHENTICATION ERROR: {}'.format(worker.name, ex))
+
+@credentials.command('limits')
+@click.option('--all', type=bool, default=False, required=False,
+              help=('Print all limits. By default, only limits '
+                    'that have been consumed are shown.'))
+@click.argument('url', required=False)
+@click.pass_context
+def get_limits(ctx, all, url):
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
+    total = {}
+    for worker in wq.queue:
+        resp = worker.client.application.rate_limit_status()
+        print('#'*20)
+        print(worker.name)
+        if url:
+            limit = 'NOT FOUND'
+            print('URL is: {}'.format(url))
+            cat = url.split('/')[1]
+            if cat in resp['resources']:
+                limit = resp['resources'][cat].get(url, None) or resp['resources'][cat]
+            else:
+                print('Cat {} not found'.format(cat))
+                continue
+            for k in limit:
+                total[k] = total.get(k, 0) + limit[k]
+            print('{}: {}'.format(url, limit))
+            continue
+        nres = {}
+        if not all:
+            for res, urls in resp['resources'].items():
+                nurls = {}
+                for u, limits in urls.items():
+                    if limits['limit'] != limits['remaining']:
+                        nurls[u] = limits
+                if nurls:
+                    nres[res] = nurls
+            resp = nres
+        print(json.dumps(resp, indent=2))
+    if url:
+        print('Total for {}: {}'.format(url, total))

 @credentials.command('add')
 @click.option('--consumer_key', default=None)
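For illustration only, here is one way the new `credentials limits` subcommand could be exercised from Python with Click's test runner. The import path `bitter.cli.main` and the presence of a working credentials/config file are assumptions, not part of this commit:

from click.testing import CliRunner

from bitter.cli import main  # assumed module path for the `main` group above

runner = CliRunner()

# Default behaviour: print only the endpoints whose quota is partially consumed.
result = runner.invoke(main, ['credentials', 'limits'])
print(result.output)

# Ask about a single endpoint; the leading '/' matters because the command
# derives the resource family from url.split('/')[1].
result = runner.invoke(main, ['credentials', 'limits', '/statuses/lookup'])
print(result.output)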
@@ -98,11 +146,17 @@ def get_tweets(ctx, tweetsfile, folder, update, retry, delimiter, header, quotec
         click.echo('Cancelling')
         return
     wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
-    for i in utils.download_file(wq, tweetsfile, folder, delimiter=delimiter,
+    status = tqdm('Queried')
+    failed = 0
+    for tid, obj in utils.download_file(wq, tweetsfile, folder, delimiter=delimiter,
                                         batch_method=utils.tweet_download_batch,
                                         header=header, quotechar=quotechar,
                                         column=column, update=update, retry_failed=retry):
-        pass
+        status.update(1)
+        if not obj:
+            failed += 1
+            status.set_description('Failed: %s. Queried' % failed, refresh=True)

 @tweet.command('search')
 @click.argument('query')
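As a side note, the progress bar above is driven by hand (`update`/`set_description`) rather than by wrapping an iterable. A minimal, self-contained sketch of that pattern, illustrative only and not taken from this commit:

from tqdm import tqdm

progress = tqdm(desc='Queried')   # no iterable: the total is unknown up front
failed = 0
for obj in [1, None, 2]:          # stand-in for downloaded tweets
    progress.update(1)
    if obj is None:
        failed += 1
        progress.set_description('Failed: %s. Queried' % failed, refresh=True)
progress.close()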
@@ -383,34 +437,6 @@ def reset_extractor(ctx):
     session = make_session(db)
     session.query(ExtractorEntry).filter(ExtractorEntry.pending==True).update({'pending':False})

-@main.command('limits')
-@click.argument('url', required=False)
-@click.pass_context
-def get_limits(ctx, url):
-    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
-    total = {}
-    for worker in wq.queue:
-        resp = worker.client.application.rate_limit_status()
-        print('#'*20)
-        print(worker.name)
-        if url:
-            limit = 'NOT FOUND'
-            print('URL is: {}'.format(url))
-            cat = url.split('/')[1]
-            if cat in resp['resources']:
-                limit = resp['resources'][cat].get(url, None) or resp['resources'][cat]
-            else:
-                print('Cat {} not found'.format(cat))
-                continue
-            for k in limit:
-                total[k] = total.get(k, 0) + limit[k]
-            print('{}: {}'.format(url, limit))
-        else:
-            print(json.dumps(resp, indent=2))
-    if url:
-        print('Total for {}: {}'.format(url, total))

 @main.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=False),
               help='''Issue a call to an endpoint of the Twitter API.''')

View File

@@ -13,6 +13,11 @@ import sqlalchemy
 import os
 import multiprocessing
 from multiprocessing.pool import ThreadPool
+from multiprocessing import Queue
+import queue
+import threading

 from select import select
 from functools import partial
@@ -22,6 +27,7 @@ from itertools import islice, chain
 from contextlib import contextmanager
 from collections import Counter
 from random import choice
+
 from builtins import map, filter
@@ -53,7 +59,7 @@ def chunk(iterable, n):
 def parallel(func, source, chunksize=1, numcpus=multiprocessing.cpu_count()):
     source = chunk(source, chunksize)
     p = ThreadPool(numcpus*2)
-    results = p.imap_unordered(func, source, chunksize=int(1000/numcpus))
+    results = p.imap_unordered(func, source)
     for i in chain.from_iterable(results):
         yield i
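For reference, `parallel` groups the source into chunks, applies `func` to each chunk in a thread pool and flattens whatever each call returns, so results arrive in completion order rather than input order. A small illustrative use, assuming `bitter.utils` is importable (the squaring function is made up):

from bitter import utils

def square_batch(batch):
    # Receives one chunk of the source and must return (or yield) an iterable.
    return [x * x for x in batch]

for value in utils.parallel(square_batch, range(10), chunksize=2):
    print(value)  # order depends on which worker finishes first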
@@ -507,7 +513,8 @@ def id_failed(oid, folder):
 def tweet_download_batch(wq, batch):
     tweets = wq.statuses.lookup(_id=",".join(batch), map=True)['id']
-    return tweets.items()
+    for tid, tweet in tweets.items():
+        yield tid, tweet

 def user_download_batch(wq, batch):
     screen_names = []
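With `map=True`, the statuses/lookup endpoint reports every requested id and maps the ones it could not return to null, which is why the generator can also yield `(tid, None)` pairs for failures. A toy stand-in (not real API code) showing the shape of what the new generator produces:

def fake_tweet_download_batch(batch):
    # Mimics the id -> tweet-or-None mapping returned by statuses/lookup?map=true.
    tweets = {tid: ({'id': tid, 'text': '...'} if tid != '3' else None)
              for tid in batch}
    for tid, tweet in tweets.items():
        yield tid, tweet

for tid, tweet in fake_tweet_download_batch(['1', '2', '3']):
    print(tid, 'ok' if tweet else 'missing')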
@@ -547,25 +554,13 @@ def user_download_batch(wq, batch):
         yield (name, None)

-def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fails=True,
-                  batch_method=tweet_download_batch):
-    def filter_lines(line):
-        # print('Checking {}'.format(line))
-        oid = line[0]
-        if (cached_id(oid, folder) and not update) or (id_failed(oid, folder) and not retry_failed):
-            yield None
-        else:
-            yield str(oid)
-    def print_result(res):
-        for oid, obj in res:
+def dump_result(oid, obj, folder, ignore_fails=True):
     if obj:
         try:
             write_json(obj, folder=folder, oid=oid)
             failed = fail_file(oid, folder)
             if os.path.exists(failed):
                 os.remove(failed)
-            yield 1
         except Exception as ex:
             logger.error('%s: %s' % (oid, ex))
             if not ignore_fails:
@@ -574,18 +569,66 @@ def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fail
         logger.info('Object not recovered: {}'.format(oid))
         with open(fail_file(oid, folder), 'w') as f:
             print('Object not found', file=f)
-            yield -1
-
-    objects_to_crawl = filter(lambda x: x is not None, tqdm(parallel(filter_lines, lst), desc='Total objects'))
-    batch_method = partial(batch_method, wq)
-    objects = parallel(batch_method, objects_to_crawl, 100)
-    failed = 0
-    pbar = tqdm(parallel(print_result, objects), desc='Queried')
-    for res in pbar:
-        if res < 0:
-            failed += 1
-            pbar.set_description('Failed: %s. Queried' % failed, refresh=True)
-        yield res

+def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fails=False,
+                  batch_method=tweet_download_batch):
+    done = Queue()
+    down = Queue()
+
+    def filter_list(lst, done, down):
+        print('filtering')
+        for oid in lst:
+            # print('Checking {}'.format(line))
+            cached = cached_id(oid, folder)
+            if (cached and not update):
+                done.put((oid, cached))
+            elif (id_failed(oid, folder) and not retry_failed):
+                done.put((oid, None))
+            else:
+                down.put(oid)
+        down.put(None)
+
+    def download_results(batch_method, down, done):
+        def gen():
+            while True:
+                r = down.get()
+                if not r:
+                    return
+                yield r
+
+        for t in parallel(batch_method, gen(), 100):
+            done.put(t)
+
+    def batch(*args, **kwargs):
+        return batch_method(wq, *args, **kwargs)
+
+    tc = threading.Thread(target=filter_list, args=(lst, done, down), daemon=True)
+    tc.start()
+    td = threading.Thread(target=download_results, args=(batch, down, done), daemon=True)
+    td.start()
+
+    def check_threads(ts, done):
+        for t in ts:
+            t.join()
+        done.put(None)
+
+    wait = threading.Thread(target=check_threads, args=([tc, td], done), daemon=True)
+    wait.start()
+
+    while True:
+        rec = done.get()
+        if rec is None:
+            break
+        oid, obj = rec
+        dump_result(oid, obj, folder, ignore_fails)
+        yield rec
+
+    wait.join()

 def download_file(wq, csvfile, folder, column=0, delimiter=',',
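The rewritten download_list is a producer/consumer pipeline: filter_list feeds ids that still need downloading into `down`, download_results drains `down` in batches and pushes results into `done`, and a watcher thread posts a final None once both workers have finished so the main generator knows when to stop. A stripped-down sketch of that shutdown pattern (generic code, not taken from the commit):

import threading
from queue import Queue

done = Queue()

def worker(items, done):
    for item in items:
        done.put(item * 2)

t = threading.Thread(target=worker, args=([1, 2, 3], done), daemon=True)
t.start()

def watcher(threads, done):
    for th in threads:
        th.join()
    done.put(None)  # sentinel: no more results will arrive

threading.Thread(target=watcher, args=([t], done), daemon=True).start()

while True:
    rec = done.get()
    if rec is None:
        break
    print(rec)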
@@ -595,8 +638,14 @@ def download_file(wq, csvfile, folder, column=0, delimiter=',',
         csvreader = csv.reader(f, delimiter=str(delimiter), quotechar=str(quotechar))
         if header:
             next(csvreader)
-        tweets = map(lambda row: row[0].strip(), csvreader)
-        for res in download_list(wq, tweets, folder, batch_method=batch_method,
+
+        def reader(r):
+            for row in csvreader:
+                if len(row) > column:
+                    yield row[column].strip()
+
+        for res in download_list(wq, reader(csvreader), folder, batch_method=batch_method,
                                  **kwargs):
             yield res
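Finally, an illustrative (not authoritative) sketch of how the reworked download_list might be driven directly; the module layout, the credentials file and the ids are assumptions. Since it is a generator, it has to be iterated for the producer and consumer threads to make progress:

from bitter import config as bconf, crawlers, utils  # assumed module layout

wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
tweet_ids = ['1122334455667788990', '1122334455667788991']  # made-up ids

for oid, obj in utils.download_list(wq, tweet_ids, folder='tweets',
                                    batch_method=utils.tweet_download_batch):
    # obj is the downloaded (or cached) tweet, or None if the id failed and was recorded.
    print(oid, 'saved' if obj else 'failed')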