From 030c41b826ae03f3993c558528707eafa14af51a Mon Sep 17 00:00:00 2001
From: J. Fernando Sánchez
Date: Tue, 7 Jan 2020 20:35:29 +0100
Subject: [PATCH] Changes to user and tweet search: cache by default

* Improved printing of credential rate limits
* Tweet and user searches cache results by default. The --write flag has
  been replaced with --dry_run
---
 bitter/VERSION  |  2 +-
 bitter/cli.py   | 98 +++++++++++++++++++++++------------------------
 bitter/utils.py | 27 ++++++++++++++++++-------
 3 files changed, 67 insertions(+), 60 deletions(-)
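
Note on the new `credentials limits` output: instead of dumping one
rate_limit_status() blob per worker, the command now folds every worker's
limits into a single map. Each worker's payload is shaped roughly like this
(abridged; the endpoints and numbers below are only illustrative):

    resp = {
        'resources': {
            'statuses': {
                '/statuses/show/:id': {'limit': 900, 'remaining': 899, 'reset': 1578424800},
            },
            'users': {
                '/users/show/:id': {'limit': 900, 'remaining': 900, 'reset': 1578424800},
            },
        },
    }

For every URL the command stores limits[url][worker.name] = value and keeps a
running 'global' entry that sums 'limit' and 'remaining' across workers. By
default only the aggregated 'global' row of each consumed URL is printed;
--no_aggregate adds the per-worker rows, and --no_diff also lists URLs whose
quota is untouched.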
diff --git a/bitter/VERSION b/bitter/VERSION
index 5712157..5eef0f1 100644
--- a/bitter/VERSION
+++ b/bitter/VERSION
@@ -1 +1 @@
-0.10.1
+0.10.2
diff --git a/bitter/cli.py b/bitter/cli.py
index 3c1094a..1cc9106 100644
--- a/bitter/cli.py
+++ b/bitter/cli.py
@@ -93,6 +93,8 @@ def main(ctx, verbose, logging_level, config, credentials):
 @main.group(invoke_without_command=True)
 @click.pass_context
 def credentials(ctx):
+    if ctx.invoked_subcommand is not None:
+        return
     wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     for worker in wq.queue:
         print('#'*20)
@@ -104,44 +106,49 @@
 
 @credentials.command('limits')
-@click.option('--all', type=bool, default=False, required=False,
-              help=('Print all limits. By default, it only limits that '
+@click.option('--no_aggregate', is_flag=True, default=False,
+              help=('Print limits from all workers. By default, limits are '
+                    'aggregated (summed).'))
+@click.option('--no_diff', is_flag=True, default=False,
+              help=('Print all limits. By default, only limits that '
                     'have been consumed will be shown.'))
 @click.argument('url', required=False)
 @click.pass_context
-def get_limits(ctx, all, url):
+def get_limits(ctx, no_aggregate, no_diff, url):
     wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
-    total = {}
+    limits = {}
+    if url:
+        print('URL is: {}'.format(url))
     for worker in wq.queue:
         resp = worker.client.application.rate_limit_status()
-        print('#'*20)
-        print(worker.name)
-        if url:
-            limit = 'NOT FOUND'
-            print('URL is: {}'.format(url))
-            cat = url.split('/')[1]
-            if cat in resp['resources']:
-                limit = resp['resources'][cat].get(url, None) or resp['resources'][cat]
-            else:
-                print('Cat {} not found'.format(cat))
-                continue
-            for k in limit:
-                total[k] = total.get(k, 0) + limit[k]
-            print('{}: {}'.format(url, limit))
-            continue
-        nres = {}
-        if not all:
-            for res, urls in resp['resources'].items():
-                nurls = {}
-                for u, limits in urls.items():
-                    if limits['limit'] != limits['remaining']:
-                        nurls[u] = limits
-                if nurls:
-                    nres[res] = nurls
-            resp = nres
-        print(json.dumps(resp, indent=2))
-    if url:
-        print('Total for {}: {}'.format(url, total))
+        for urlimits in resp['resources'].values():
+            for u, value in urlimits.items():
+                if url and u != url:
+                    continue
+                if u not in limits:
+                    limits[u] = {}
+                glob = limits[u].get('global', {})
+                limits[u][worker.name] = value
+                for k in ['limit', 'remaining']:
+                    if k not in glob:
+                        glob[k] = 0
+                    glob[k] += value[k]
+                limits[u]['global'] = glob
+    for u, lims in limits.items():
+        worker_list = lims.keys() if no_aggregate else ['global', ]
+
+        url_printed = False
+
+        for worker in worker_list:
+            vals = lims[worker]
+            consumed = vals['limit'] - vals['remaining']
+            if no_diff or consumed:
+                if not url_printed:
+                    print(u)
+                    url_printed = True
+                print('\t', worker, ':')
+                print('\t\t', vals)
+
 
 
 @credentials.command('add')
 @click.option('--consumer_key', default=None)
@@ -169,14 +176,14 @@ def tweet(ctx):
     pass
 
 @tweet.command('get')
-@click.option('-w', '--write', is_flag=True, default=False)
+@click.option('-d', '--dry_run', is_flag=True, default=False)
 @click.option('-f', '--folder', default="tweets")
 @click.option('-u', '--update', help="Update the file even if the tweet exists", is_flag=True, default=False)
 @click.argument('tweetid')
 @serialize
-def get_tweet(tweetid, write, folder, update):
+def get_tweet(tweetid, dry_run, folder, update):
     wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
-    yield from utils.download_tweet(wq, tweetid, write, folder, update)
+    yield from utils.download_tweet(wq, tweetid, not dry_run, folder, update)
 
 @tweet.command('get_all', help='''Download tweets from a list of tweets in a CSV file.
 The result is stored as individual json files in your folder of choice.''')
@@ -245,26 +252,13 @@ def list_users(ctx, db):
 
 @users.command('get')
 @click.argument('user')
-@click.option('-w', '--write', is_flag=True, default=False)
+@click.option('-d', '--dry_run', is_flag=True, default=False)
 @click.option('-f', '--folder', default="users")
 @click.option('-u', '--update', help="Update the file even if the user exists", is_flag=True, default=False)
-def get_user(user, write, folder, update):
+@serialize
+def get_user(user, dry_run, folder, update):
     wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
-    if not write:
-        u = utils.get_user(wq, user)
-        js = json.dumps(u, indent=2)
-        print(js)
-        return
-    if not os.path.exists(folder):
-        os.makedirs(folder)
-    file = os.path.join(folder, '%s.json' % user)
-    if not update and os.path.exists(file) and os.path.isfile(file):
-        print('User exists: %s' % user)
-        return
-    with open(file, 'w') as f:
-        u = utils.get_user(wq, user)
-        js = json.dumps(u, indent=2)
-        print(js, file=f)
+    yield from utils.download_user(wq, user, not dry_run, folder, update)
 
 @users.command('get_all', help='''Download users from a list of user ids/screen names in a CSV file.
 The result is stored as individual json files in your folder of choice.''')
diff --git a/bitter/utils.py b/bitter/utils.py
index 6a98c48..8f39470 100644
--- a/bitter/utils.py
+++ b/bitter/utils.py
@@ -277,8 +277,7 @@ def download_entry(wq, entry_id, dburi=None, recursive=False):
         download_user(wq, session, user, entry, recursive)
     session.close()
 
-
-def download_user(wq, session, user, entry=None, recursive=False, max_followers=50000):
+def crawl_user(wq, session, user, entry=None, recursive=False, max_followers=50000):
 
     total_followers = user.followers_count
 
@@ -478,13 +477,20 @@ def download_tweet(wq, tweetid, write=True, folder="downloaded_tweets", update=F
     tweet = cached_id(tweetid, folder)
     if update or not tweet:
         tweet = get_tweet(wq, tweetid)
-        if write:
-            if tweet:
-                js = json.dumps(tweet)
-                write_json(js, folder)
+        if write and tweet:
+            write_json(tweet, folder)
     yield tweet
 
 
+def download_user(wq, userid, write=True, folder="downloaded_users", update=False):
+    user = cached_id(userid, folder)
+    if update or not user:
+        user = get_user(wq, userid)
+        if write and user:
+            write_json(user, folder, aliases=[user['screen_name'], ])
+    yield user
+
+
 def cached_id(oid, folder):
     tweet = None
     file = os.path.join(folder, '%s.json' % oid)
@@ -497,7 +503,7 @@ def cached_id(oid, folder):
         logger.error('Error getting cached version of {}: {}'.format(oid, ex))
     return tweet
 
-def write_json(js, folder, oid=None):
+def write_json(js, folder, oid=None, aliases=[]):
     if not oid:
         oid = js['id']
     file = id_file(oid, folder)
@@ -506,6 +512,13 @@
     with open(file, 'w') as f:
         json.dump(js, f)
     logger.info('Written {} to file {}'.format(oid, file))
+    for alias in aliases:
+        alias_file = id_file(alias, folder)
+        # Replace a stale symlink, but never clobber a regular file
+        if os.path.islink(alias_file):
+            os.remove(alias_file)
+        if not os.path.exists(alias_file):
+            os.symlink('%s.json' % oid, alias_file)
 
 def id_file(oid, folder):
     return os.path.join(folder, '%s.json' % oid)
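
Usage note: with --dry_run unset, `tweet get` and `users get` now behave as a
read-through cache. A minimal sketch of the intended round trip through the
utils helpers (the imports mirror the ones cli.py uses; 'some_user' is a
placeholder screen name):

    from bitter import config as bconf
    from bitter import crawlers, utils

    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)

    # First call hits the API and writes users/<id>.json, plus a
    # <screen_name>.json symlink so later lookups by name are cache hits.
    user = next(utils.download_user(wq, 'some_user', folder='users'))

    # Second call is answered from disk; no API credits are spent.
    # Pass update=True to force a refresh of the cached copy.
    user = next(utils.download_user(wq, 'some_user', folder='users'))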