From 5199d5b5aac5b3e1647fd469cff05cfb73e6127d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=2E=20Fernando=20S=C3=A1nchez?=
Date: Tue, 20 Mar 2018 13:29:18 +0100
Subject: [PATCH] Improve CLI. Add credentials

---
 README.md           |  82 +++++++++++++++++++++++-
 bitter/VERSION      |   2 +-
 bitter/cli.py       |  98 +++++++++++++++++++++-------
 bitter/crawlers.py  |  10 +--
 bitter/utils.py     | 152 +++++++++++++++++++++++++++++---------------
 tests/test_utils.py |   4 +-
 6 files changed, 266 insertions(+), 82 deletions(-)

diff --git a/README.md b/README.md
index 0a9a2e2..37df3a0 100644
--- a/README.md
+++ b/README.md
@@ -32,9 +32,89 @@ bitter api statuses/user_timeline --id thepsf --count 500
 ```
 
+## Adding credentials
+
+```
+bitter credentials add
+```
+
+You can specify the parameters in the command or let the command line guide you through the process.
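+
+For example, to add a set of credentials non-interactively (all values below are placeholders for your own keys):
+
+```
+bitter credentials add --consumer_key mykey --consumer_secret mysecret --token_key mytoken --token_secret mytokensecret myusername
+```
+
+Any parameter you leave out will be prompted for interactively.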
+
 # Examples
 
-The CLI can query the rest API:
+## Downloading a list of tweets
+
+Bitter can download tweets from a list of tweets in a CSV file.
+The result is stored as individual json files in your folder of choice.
+You can even specify the column number for tweet ids.
+Bitter will not try to download tweets that are already in that folder, unless you pass the `--update` flag.
+
+```
+Usage: bitter tweet get_all [OPTIONS] TWEETSFILE
+
+  Download tweets from a list of tweets in a CSV file. The result is stored
+  as individual json files in your folder of choice.
+
+Options:
+  -f, --folder TEXT
+  -u, --update          Download tweet even if it is already present.
+                        WARNING: it will overwrite existing files!
+  -d, --delimiter TEXT
+  -h, --header          Discard the first line (use it as a header)
+  -q, --quotechar TEXT
+  -c, --column INTEGER
+  --help                Show this message and exit.
+
+```
+
+For instance, this will download the tweets listed in `tweet_ids.csv` into the `tweet_info` folder:
+
+```
+bitter tweet get_all -f tweet_info tweet_ids.csv
+```
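+
+As an illustration, if the ids were in the second column of a CSV file with a header row (made-up contents below), you could select them with the `--header` and `--column` options:
+
+```
+author,tweet_id
+somescreenname,1112223334445556667
+```
+
+```
+bitter tweet get_all -f tweet_info -h -c 1 tweet_ids.csv
+```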
+
+## Downloading a list of users
+
+Bitter downloads users and tweets in a similar way:
+
+```
+Usage: bitter users get_all [OPTIONS] USERSFILE
+
+  Download users from a list of user ids/screen names in a CSV file. The
+  result is stored as individual json files in your folder of choice.
+
+Options:
+  -f, --folder TEXT
+  -u, --update          Download user even if it is already present.
+                        WARNING: it will overwrite existing files!
+  -d, --delimiter TEXT
+  -h, --header          Discard the first line (use it as a header)
+  -q, --quotechar TEXT
+  -c, --column INTEGER
+  --help                Show this message and exit.
+```
+
+The only difference is that users can be downloaded via `screen_name` or `user_id`.
+This method does not try to resolve screen names to user ids, so a user may be downloaded more than once if it appears in both forms.
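+
+For instance, this would download every user listed in a (hypothetical) `user_ids.csv` into the `user_info` folder, skipping any user that is already there:
+
+```
+bitter users get_all -f user_info user_ids.csv
+```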
+
+## Downloading a stream
+
+```
+Usage: bitter stream get [OPTIONS]
+
+Options:
+  -l, --locations TEXT
+  -t, --track TEXT
+  -f, --file TEXT       File to store the stream of tweets. Default: standard output
+  -p, --politelyretry   Politely retry after a hangup/connection error
+  --help                Show this message and exit.
+```
+
+```
+bitter --config .bitter.yaml stream get
+```
+
+## REST queries
+
+In newer versions of bitter, the individual methods to download tweets/users through the REST API are being replaced with a generic method to call the API:
+
+```
+bitter api CMD --parameter VALUE ...
+           [--tweets | --users] [--max_count MAX_COUNT] [--count COUNT_PER_CALL]
+```
+
+For instance, this searches for recent tweets that mention bitter and appends them to `mytweets.jsonlines`:
+
+```
+python -m bitter.cli --config .bitter.yaml api '/search/tweets' --result_type recent --q 'bitter OR #bitter OR @bitter' --tweet_mode extended --tweets --max_count 5000 >> mytweets.jsonlines
+```
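+
+The `--tweets` and `--users` flags enable smart pagination. As an illustrative call (reusing the endpoint from the first example above), this would request pages of 200 tweets until 1000 tweets of the timeline have been fetched:
+
+```
+bitter api statuses/user_timeline --id thepsf --tweets --count 200 --max_count 1000
+```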
diff --git a/bitter/VERSION b/bitter/VERSION
index a3df0a6..ac39a10 100644
--- a/bitter/VERSION
+++ b/bitter/VERSION
@@ -1 +1 @@
-0.8.0
+0.9.0
diff --git a/bitter/cli.py b/bitter/cli.py
index 92d395a..493bcb9 100644
--- a/bitter/cli.py
+++ b/bitter/cli.py
@@ -41,6 +41,32 @@ def main(ctx, verbose, logging_level, config, credentials):
     if os.path.exists(utils.get_config_path(credentials)):
         utils.copy_credentials_to_config(credentials, config)
 
+
+@main.group()
+@click.pass_context
+def credentials(ctx):
+    pass
+
+@credentials.command('add')
+@click.option('--consumer_key', default=None)
+@click.option('--consumer_secret', default=None)
+@click.option('--token_key', default=None)
+@click.option('--token_secret', default=None)
+@click.argument('user_name')
+def add(user_name, consumer_key, consumer_secret, token_key, token_secret):
+    if not consumer_key:
+        consumer_key = click.prompt('Please enter your CONSUMER KEY')
+    if not consumer_secret:
+        consumer_secret = click.prompt('Please enter your CONSUMER SECRET')
+    if not token_key:
+        token_key = click.prompt('Please enter your ACCESS TOKEN')
+    if not token_secret:
+        token_secret = click.prompt('Please enter your ACCESS TOKEN SECRET')
+    utils.add_credentials(conffile=bconf.CONFIG_FILE, user=user_name,
+                          consumer_key=consumer_key, consumer_secret=consumer_secret,
+                          token_key=token_key, token_secret=token_secret)
+    click.echo('Credentials added for {}'.format(user_name))
+
+
 @main.group()
 @click.pass_context
 def tweet(ctx):
@@ -52,22 +78,36 @@ def tweet(ctx):
 @click.option('-u', '--update', help="Update the file even if the tweet exists", is_flag=True, default=False)
 @click.argument('tweetid')
 def get_tweet(tweetid, write, folder, update):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     utils.download_tweet(wq, tweetid, write, folder, update)
 
-
-@tweet.command('get_all')
+
+@tweet.command('get_all', help='''Download tweets from a list of tweets in a CSV file.
+The result is stored as individual json files in your folder of choice.''')
 @click.argument('tweetsfile', 'File with a list of tweets to look up')
 @click.option('-f', '--folder', default="tweets")
+@click.option('-u', '--update', is_flag=True, default=False, help='Download tweet even if it is already present. WARNING: it will overwrite existing files!')
+@click.option('-d', '--delimiter', default=",")
+@click.option('-h', '--header', help='Discard the first line (use it as a header)',
+              is_flag=True, default=False)
+@click.option('-q', '--quotechar', default='"')
+@click.option('-c', '--column', type=int, default=0)
 @click.pass_context
-def get_tweets(ctx, tweetsfile, folder):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
-    utils.download_tweets(wq, tweetsfile, folder)
+def get_tweets(ctx, tweetsfile, folder, update, delimiter, header, quotechar, column):
+    if update and not click.confirm('This may overwrite existing tweets. Continue?'):
+        click.echo('Cancelling')
+        return
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
+    for i in utils.download_file(wq, tweetsfile, folder, delimiter=delimiter,
+                                 batch_method=utils.tweet_download_batch,
+                                 header=header, quotechar=quotechar,
+                                 column=column, update=update):
+        pass
 
 @tweet.command('search')
 @click.argument('query')
 @click.pass_context
 def search(ctx, query):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     t = utils.search_tweet(wq, query)
     print(json.dumps(t, indent=2))
 
@@ -75,7 +115,7 @@ def search(ctx, query):
 @click.argument('user')
 @click.pass_context
 def timeline(ctx, user):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     t = utils.user_timeline(wq, user)
     print(json.dumps(t, indent=2))
 
@@ -101,7 +141,7 @@ def list_users(ctx, db):
 @click.option('-f', '--folder', default="users")
 @click.option('-u', '--update', help="Update the file even if the user exists", is_flag=True, default=False)
 def get_user(user, write, folder, update):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     if not write:
         u = utils.get_user(wq, user)
         js = json.dumps(u, indent=2)
@@ -118,15 +158,28 @@ def get_user(user, write, folder, update):
         js = json.dumps(u, indent=2)
         print(js, file=f)
 
-@users.command('get_all')
+@users.command('get_all', help='''Download users from a list of user ids/screen names in a CSV file.
+    The result is stored as individual json files in your folder of choice.''')
 @click.argument('usersfile', 'File with a list of users to look up')
 @click.option('-f', '--folder', default="users")
+@click.option('-u', '--update', is_flag=True, default=False, help='Download user even if it is already present. WARNING: it will overwrite existing files!')
+@click.option('-d', '--delimiter', default=",")
+@click.option('-h', '--header', help='Discard the first line (use it as a header)',
+              is_flag=True, default=False)
+@click.option('-q', '--quotechar', default='"')
+@click.option('-c', '--column', type=int, default=0)
 @click.pass_context
-def get_users(ctx, usersfile, folder):
-    with open(usersfile) as f:
-        for line in f:
-            uid = line.strip()
-            ctx.invoke(get_user, folder=folder, user=uid, write=True)
+def get_users(ctx, usersfile, folder, update, delimiter, header, quotechar, column):
+    if update and not click.confirm('This may overwrite existing users. Continue?'):
+        click.echo('Cancelling')
+        return
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
+    for i in utils.download_file(wq, usersfile, folder, delimiter=delimiter,
+                                 batch_method=utils.user_download_batch,
+                                 update=update,
+                                 header=header, quotechar=quotechar,
+                                 column=column):
+        pass
 
 @users.command('crawl')
 @click.option('--db', required=True, help='Database to save all users.')
@@ -147,7 +200,7 @@ def crawl_users(ctx, usersfile, skip, until, threads, db):
             return ExitStack()
 
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
 
     logger.info('Starting Network crawler with {} threads and {} credentials.'.format(threads, len(wq.queue)))
 
@@ -311,7 +364,7 @@ def users_extractor(ctx):
 @click.pass_context
 def extract(ctx, recursive, user, name, initfile):
     print(locals())
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     dburi = ctx.obj['DBURI']
     utils.extract(wq,
                   recursive=recursive,
@@ -323,7 +376,7 @@ def extract(ctx, recursive, user, name, initfile):
 @extractor.command('reset')
 @click.pass_context
 def reset_extractor(ctx):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     db = ctx.obj['DBURI']
     session = make_session(db)
     session.query(ExtractorEntry).filter(ExtractorEntry.pending==True).update({'pending':False})
@@ -332,7 +385,7 @@ def reset_extractor(ctx):
 @click.argument('url', required=False)
 @click.pass_context
 def get_limits(ctx, url):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     total = {}
     for worker in wq.queue:
         resp = worker.client.application.rate_limit_status()
@@ -357,7 +410,8 @@ def get_limits(ctx, url):
 
 
-@main.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=False))
+@main.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=False),
+              help='''Issue a call to an endpoint of the Twitter API.''')
 @click.argument('cmd', nargs=1)
 @click.option('--tweets', is_flag=True, help='Fetch more tweets using smart pagination. Use --count to control how many tweets to fetch per call, and --max_count to set the number of desired tweets (or -1 to get as many as possible).', type=bool, default=False)
 @click.option('--users', is_flag=True, help='Fetch more users using smart pagination. Use --count to control how many users to fetch per call, and --max_count to set the number of desired users (or -1 to get as many as possible).', type=bool, default=False)
@@ -374,7 +428,7 @@ def api(ctx, cmd, tweets, users, api_args):
         if k in mappings:
             k = mappings[k]
         opts[k] = v
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     if tweets:
         resp = utils.consume_tweets(wq[cmd], **opts)
     elif users:
@@ -409,7 +463,7 @@ def stream(ctx):
 @click.option('-p', '--politelyretry', help='Politely retry after a hangup/connection error', is_flag=True, default=True)
 @click.pass_context
 def get_stream(ctx, locations, track, file, politelyretry):
-    wq = crawlers.StreamQueue.from_config(bconf.CONFIG_FILE, 1)
+    wq = crawlers.StreamQueue.from_config(conffile=bconf.CONFIG_FILE, max_workers=1)
 
     query_args = {}
     if locations:
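The next hunk, in `bitter/crawlers.py`, lets `from_config` accept an already-parsed configuration dict as an alternative to reading a config file. A minimal sketch of the new calling style (the credentials shown are made-up placeholders):

```python
from bitter import crawlers

# Hypothetical, hard-coded configuration; normally this is read from the config file
config = {'credentials': [
    {'user': 'myusername',
     'consumer_key': 'mykey', 'consumer_secret': 'mysecret',
     'token_key': 'mytoken', 'token_secret': 'mytokensecret'},
]}
wq = crawlers.TwitterQueue.from_config(config=config)
```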
diff --git a/bitter/crawlers.py b/bitter/crawlers.py
index 8ebd0c8..399394b 100644
--- a/bitter/crawlers.py
+++ b/bitter/crawlers.py
@@ -61,12 +61,14 @@ class FromCredentialsMixin(object):
 
 class FromConfigMixin(object):
     @classmethod
-    def from_config(cls, conffile=None, max_workers=None):
+    def from_config(cls, config=None, conffile=None, max_workers=None):
         wq = cls()
 
-        with utils.config(conffile) as c:
-            for cred in islice(c['credentials'], max_workers):
-                wq.ready(cls.worker_class(cred["user"], cred))
+        if not config:
+            with utils.config(conffile) as c:
+                config = c
+        for cred in islice(config['credentials'], max_workers):
+            wq.ready(cls.worker_class(cred["user"], cred))
         return wq
 
 class TwitterWorker(object):
diff --git a/bitter/utils.py b/bitter/utils.py
index d520e39..02f9368 100644
--- a/bitter/utils.py
+++ b/bitter/utils.py
@@ -4,6 +4,7 @@ import logging
 import time
 import json
 import yaml
+import csv
 
 import io
 import signal
@@ -93,7 +94,7 @@ def read_config(conffile):
     p = conffile and get_config_path(conffile)
     if p:
         if not os.path.exists(p):
-            raise Exception('{} file does not exist.'.format(p))
+            raise IOError('{} file does not exist.'.format(p))
         f = open(p, 'r')
     elif 'BITTER_CONFIG' not in os.environ:
         raise Exception('No config file or BITTER_CONFIG env variable.')
@@ -103,6 +104,8 @@ def read_config(conffile):
 
 def write_config(conf, conffile=None):
+    if not conf:
+        conf = {'credentials': []}
     if conffile:
         p = get_config_path(conffile)
         with open(p, 'w') as f:
@@ -122,6 +125,7 @@ def create_config_file(conffile=None):
     conffile = get_config_path(conffile)
     with open(conffile, 'a'):
         pass
+    write_config(None, conffile)
 
 
 def get_credentials(conffile=None, inverse=False, **kwargs):
@@ -142,7 +146,11 @@ def delete_credentials(conffile=None, **creds):
 
 
 def add_credentials(conffile=None, **creds):
-    exist = get_credentials(conffile, **creds)
+    try:
+        exist = get_credentials(conffile, **creds)
+    except IOError:
+        exist = False
+        create_config_file(conffile)
     if exist:
         return
     with config(conffile) as c:
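With these changes, `add_credentials` becomes self-healing: if the config file does not exist yet, it is created with an empty credentials list before the new entry is appended. A rough sketch of the resulting flow from Python (the file name and values are illustrative):

```python
from bitter import utils

# Creates the config file with an empty credential list if it is missing
utils.add_credentials(conffile='.bitter.yaml', user='myusername',
                      consumer_key='mykey', consumer_secret='mysecret',
                      token_key='mytoken', token_secret='mytokensecret')

# Stored entries can be looked up by their fields, e.g. the user name
print(utils.get_credentials(conffile='.bitter.yaml', user='myusername'))
```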
@@ -451,86 +459,128 @@ def get_user(c, user):
         return c.users.lookup(screen_name=user)[0]
 
 def download_tweet(wq, tweetid, write=True, folder="downloaded_tweets", update=False):
-    cached = cached_tweet(tweetid, folder)
+    cached = cached_id(tweetid, folder)
     tweet = None
     if update or not cached:
         tweet = get_tweet(wq, tweetid)
-    js = json.dumps(tweet, indent=2)
+    js = json.dumps(tweet)
     if write:
         if tweet:
-            write_tweet_json(js, folder)
+            write_json(tweet, folder)
     else:
         print(js)
 
-def cached_tweet(tweetid, folder):
+def cached_id(oid, folder):
     tweet = None
-    file = os.path.join(folder, '%s.json' % tweetid)
+    file = os.path.join(folder, '%s.json' % oid)
     if os.path.exists(file) and os.path.isfile(file):
         try:
-            # print('%s: Tweet exists' % tweetid)
+            # print('%s: Object exists' % oid)
             with open(file) as f:
                 tweet = json.load(f)
         except Exception as ex:
-            logger.error('Error getting cached version of {}: {}'.format(tweetid, ex))
+            logger.error('Error getting cached version of {}: {}'.format(oid, ex))
     return tweet
 
-def write_tweet_json(js, folder):
-    tweetid = js['id']
-    file = tweet_file(tweetid, folder)
+def write_json(js, folder, oid=None):
+    if not oid:
+        oid = js['id']
+    file = id_file(oid, folder)
     if not os.path.exists(folder):
         os.makedirs(folder)
     with open(file, 'w') as f:
-        json.dump(js, f, indent=2)
-        logger.info('Written {} to file {}'.format(tweetid, file))
+        json.dump(js, f)
+        logger.info('Written {} to file {}'.format(oid, file))
 
-def tweet_file(tweetid, folder):
-    return os.path.join(folder, '%s.json' % tweetid)
+def id_file(oid, folder):
+    return os.path.join(folder, '%s.json' % oid)
 
-def tweet_fail_file(tweetid, folder):
+def fail_file(oid, folder):
     failsfolder = os.path.join(folder, 'failed')
     if not os.path.exists(failsfolder):
         os.makedirs(failsfolder)
-    return os.path.join(failsfolder, '%s.failed' % tweetid)
+    return os.path.join(failsfolder, '%s.failed' % oid)
 
-def tweet_failed(tweetid, folder):
-    return os.path.isfile(tweet_fail_file(tweetid, folder))
+def id_failed(oid, folder):
+    return os.path.isfile(fail_file(oid, folder))
 
-def download_tweets(wq, tweetsfile, folder, update=False, retry_failed=False, ignore_fails=True):
-    def filter_line(line):
-        tweetid = int(line)
-        # print('Checking {}'.format(tweetid))
-        if (cached_tweet(tweetid, folder) and not update) or (tweet_failed(tweetid, folder) and not retry_failed):
+def tweet_download_batch(wq, batch):
+    tweets = wq.statuses.lookup(_id=",".join(batch), map=True)['id']
+    return tweets.items()
+
+def user_download_batch(wq, batch):
+    screen_names = []
+    user_ids = []
+    for elem in batch:
+        try:
+            user_ids.append(int(elem))
+        except ValueError:
+            screen_names.append(elem)
+    print('Downloading: {} - {}'.format(user_ids, screen_names))
+    users = wq.users.lookup(user_id=",".join(str(uid) for uid in user_ids),
+                            screen_name=",".join(screen_names))
+    found_ids = []
+    found_names = []
+    for user in users:
+        uid = user['id']
+        if uid in user_ids:
+            found_ids.append(uid)
+            yield (uid, user)
+        uname = user['screen_name']
+        if uname in screen_names:
+            found_names.append(uname)
+            yield (uname, user)
+    for uid in set(user_ids) - set(found_ids):
+        yield (uid, None)
+    for name in set(screen_names) - set(found_names):
+        yield (name, None)
+
+
+def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fails=True,
+                  batch_method=tweet_download_batch):
+    def filter_lines(line):
+        # print('Checking {}'.format(line))
+        oid = line
+        if (cached_id(oid, folder) and not update) or (id_failed(oid, folder) and not retry_failed):
             yield None
         else:
-            yield line
+            yield str(oid)
 
     def print_result(res):
-        tid, tweet = res
-        if tweet:
-            try:
-                write_tweet_json(tweet, folder=folder)
-                yield 1
-            except Exception as ex:
-                logger.error('%s: %s' % (tid, ex))
-                if not ignore_fails:
-                    raise
-        else:
-            logger.info('Tweet not recovered: {}'.format(tid))
-            with open(tweet_fail_file(tid, folder), 'w') as f:
-                print('Tweet not found', file=f)
-            yield -1
-
-    def download_batch(batch):
-        tweets = wq.statuses.lookup(_id=",".join(batch), map=True)['id']
-        return tweets.items()
-
-    with open(tweetsfile) as f:
-        lines = map(lambda x: x.strip(), f)
-        lines_to_crawl = filter(lambda x: x is not None, tqdm(parallel(filter_line, lines), desc='Total lines'))
-        tweets = parallel(download_batch, lines_to_crawl, 100)
-        for res in tqdm(parallel(print_result, tweets), desc='Queried'):
-            pass
+        for oid, obj in res:
+            if obj:
+                try:
+                    write_json(obj, folder=folder, oid=oid)
+                    yield 1
+                except Exception as ex:
+                    logger.error('%s: %s' % (oid, ex))
+                    if not ignore_fails:
+                        raise
+            else:
+                logger.info('Object not recovered: {}'.format(oid))
+                with open(fail_file(oid, folder), 'w') as f:
+                    print('Object not found', file=f)
+                yield -1
+
+    objects_to_crawl = filter(lambda x: x is not None, tqdm(parallel(filter_lines, lst), desc='Total objects'))
+    batch_method = partial(batch_method, wq)
+    tweets = parallel(batch_method, objects_to_crawl, 100)
+    for res in tqdm(parallel(print_result, tweets), desc='Queried'):
+        yield res
+
+
+def download_file(wq, csvfile, folder, column=0, delimiter=',',
+                  header=False, quotechar='"', batch_method=tweet_download_batch,
+                  **kwargs):
+    with open(csvfile) as f:
+        csvreader = csv.reader(f, delimiter=str(delimiter), quotechar=str(quotechar))
+        if header:
+            next(csvreader)
+        tweets = map(lambda row: row[column].strip(), csvreader)
+        for res in download_list(wq, tweets, folder, batch_method=batch_method,
+                                 **kwargs):
+            yield res
+
 
 def download_timeline(wq, user):
     return wq.statuses.user_timeline(id=user)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index e052ba0..417f1d4 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -54,7 +54,7 @@ class TestUtils(TestCase):
         toc = time.time()
         assert (tic-toc) < 600
         resp2 = utils.parallel(echo, [1,2,3,4], chunksize=2)
-        assert list(resp2) == [1,2,3,4]
+        assert list(resp2) == [1,2, 3,4]
 
 
 class TestUtilsEnv(TestUtils):
@@ -68,5 +68,3 @@ class TestUtilsEnv(TestUtils):
     def tearDown(self):
         if hasattr(self, 'oldenv'):
             os.environ['BITTER_CONFIG'] = self.oldenv
-
-