bitter (mirror of https://github.com/balkian/bitter.git)

Commit 5199d5b5aa (parent 6259013978): Improve CLI. Add credentials

Changed files: README.md (82 lines), bitter/cli.py, bitter/crawlers.py, bitter/utils.py (150 lines), and the test suite.
README.md

@@ -32,9 +32,89 @@ bitter api statuses/user_timeline --id thepsf --count 500
 ```
+
+## Adding credentials
+
+```
+bitter --config <YOUR CONFIGURATION FILE> credentials add
+```
+
+You can specify the parameters in the command or let the command line guide you through the process.
+
 # Examples
 
-The CLI can query the rest API:
+## Downloading a list of tweets
+
+Bitter can download tweets from a list of tweets in a CSV file.
+The result is stored as individual json files in your folder of choice.
+You can even specify the column number for tweet ids.
+Bitter will not try to download tweets that are already in that folder, unless the update flag is set.
+
+```
+Usage: bitter tweet get_all [OPTIONS] TWEETSFILE
+
+  Download tweets from a list of tweets in a CSV file. The result is stored
+  as individual json files in your folder of choice.
+
+Options:
+  -f, --folder TEXT
+  -d, --delimiter TEXT
+  -h, --header          Discard the first line (use it as a header)
+  -q, --quotechar TEXT
+  -c, --column INTEGER
+  --help                Show this message and exit.
+```
+
+For instance, this will download the tweets listed in `tweet_ids.csv` into the `tweet_info` folder:
+
+```
+bitter tweet get_all -f tweet_info tweet_ids.csv
+```
+
+## Downloading a list of users
+
+Bitter downloads users and tweets in a similar way:
+
+```
+Usage: bitter users get_all [OPTIONS] USERSFILE
+
+  Download users from a list of user ids/screen names in a CSV file. The
+  result is stored as individual json files in your folder of choice.
+
+Options:
+  -f, --folder TEXT
+  -d, --delimiter TEXT
+  -h, --header          Discard the first line (use it as a header)
+  -q, --quotechar TEXT
+  -c, --column INTEGER
+  --help                Show this message and exit.
+```
+
+The only difference is that users can be downloaded via `screen_name` or `user_id`.
+This method does not try to resolve screen names to user ids, so users may be downloaded more than once if they appear in both forms.
+
+## Downloading a stream
+
+```
+Usage: bitter stream get [OPTIONS]
+
+Options:
+  -l, --locations TEXT
+  -t, --track TEXT
+  -f, --file TEXT       File to store the stream of tweets. Default: standard output
+  -p, --politelyretry   Politely retry after a hangup/connection error
+  --help                Show this message and exit.
+```
+
+```
+bitter --config .bitter.yaml stream get
+```
+
+python -m bitter.cli --config .bitter.yaml api '/search/tweets' --result_type recent --q 'bitter OR #bitter OR @bitter' --tweet_mode extended --tweets --max_count 5000 >> mytweets.jsonlines
+
+## REST queries
+
+In newer versions of bitter, individual methods to download tweets/users using the REST API are being replaced with a generic method to call the API.
+
 ```
 bitter api <URL endpoint> --parameter VALUE ... | [--tweets | --users] [--max_count MAX_COUNT] [--count COUNT_PER_CALL]
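Note that the README gives a concrete invocation only for tweets. An equivalent call for users would follow the same shape (the file and folder names here are illustrative, not taken from the commit):

```
bitter users get_all -f user_info user_ids.csv
```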
(version file)

@@ -1 +1 @@
-0.8.0
+0.9.0
bitter/cli.py

@@ -41,6 +41,32 @@ def main(ctx, verbose, logging_level, config, credentials):
     if os.path.exists(utils.get_config_path(credentials)):
         utils.copy_credentials_to_config(credentials, config)
 
 
+@main.group()
+@click.pass_context
+def credentials(ctx):
+    pass
+
+
+@credentials.command('add')
+@click.option('--consumer_key', default=None)
+@click.option('--consumer_secret', default=None)
+@click.option('--token_key', default=None)
+@click.option('--token_secret', default=None)
+@click.argument('user_name')
+def add(user_name, consumer_key, consumer_secret, token_key, token_secret):
+    if not consumer_key:
+        consumer_key = click.prompt('Please, enter your CONSUMER KEY')
+    if not consumer_secret:
+        consumer_secret = click.prompt('Please, enter your CONSUMER SECRET')
+    if not token_key:
+        token_key = click.prompt('Please, enter your ACCESS TOKEN')
+    if not token_secret:
+        token_secret = click.prompt('Please, enter your ACCESS TOKEN SECRET')
+    utils.add_credentials(conffile=bconf.CONFIG_FILE, user=user_name,
+                          consumer_key=consumer_key, consumer_secret=consumer_secret,
+                          token_key=token_key, token_secret=token_secret)
+    click.echo('Credentials added for {}'.format(user_name))
+
+
 @main.group()
 @click.pass_context
 def tweet(ctx):
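The new `credentials` group turns credential setup into a guided process. A possible interactive session (the user name and key values are placeholders; the prompt texts come from the code above):

```
$ bitter --config .bitter.yaml credentials add myuser
Please, enter your CONSUMER KEY: xxxxxxxx
Please, enter your CONSUMER SECRET: xxxxxxxx
Please, enter your ACCESS TOKEN: xxxxxxxx
Please, enter your ACCESS TOKEN SECRET: xxxxxxxx
Credentials added for myuser
```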
@@ -52,22 +78,36 @@ def tweet(ctx):
 @click.option('-u', '--update', help="Update the file even if the tweet exists", is_flag=True, default=False)
 @click.argument('tweetid')
 def get_tweet(tweetid, write, folder, update):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     utils.download_tweet(wq, tweetid, write, folder, update)
 
-@tweet.command('get_all')
+@tweet.command('get_all', help='''Download tweets from a list of tweets in a CSV file.
+
+The result is stored as individual json files in your folder of choice.''')
 @click.argument('tweetsfile', 'File with a list of tweets to look up')
 @click.option('-f', '--folder', default="tweets")
+@click.option('-u', '--update', is_flag=True, default=False, help='Download tweets even if they are already present. WARNING: it will overwrite existing files!')
+@click.option('-d', '--delimiter', default=",")
+@click.option('-h', '--header', help='Discard the first line (use it as a header)',
+              is_flag=True, default=False)
+@click.option('-q', '--quotechar', default='"')
+@click.option('-c', '--column', type=int, default=0)
 @click.pass_context
-def get_tweets(ctx, tweetsfile, folder):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
-    utils.download_tweets(wq, tweetsfile, folder)
+def get_tweets(ctx, tweetsfile, folder, update, delimiter, header, quotechar, column):
+    if update and not click.confirm('This may overwrite existing tweets. Continue?'):
+        click.echo('Cancelling')
+        return
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
+    for i in utils.download_file(wq, tweetsfile, folder, delimiter=delimiter,
+                                 batch_method=utils.tweet_download_batch,
+                                 header=header, quotechar=quotechar,
+                                 column=column, update=update):
+        pass
 
 @tweet.command('search')
 @click.argument('query')
 @click.pass_context
 def search(ctx, query):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     t = utils.search_tweet(wq, query)
     print(json.dumps(t, indent=2))
 
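The new options let `get_all` read ids out of arbitrary CSV layouts. For instance, a hypothetical file with a header row, semicolon delimiters, and tweet ids in the third column could be processed with:

```
bitter tweet get_all -f tweet_info -d ';' -h -c 2 tweet_ids.csv
```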
@@ -75,7 +115,7 @@ def search(ctx, query):
 @click.argument('user')
 @click.pass_context
 def timeline(ctx, user):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     t = utils.user_timeline(wq, user)
     print(json.dumps(t, indent=2))
 
@@ -101,7 +141,7 @@ def list_users(ctx, db):
 @click.option('-f', '--folder', default="users")
 @click.option('-u', '--update', help="Update the file even if the user exists", is_flag=True, default=False)
 def get_user(user, write, folder, update):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     if not write:
         u = utils.get_user(wq, user)
         js = json.dumps(u, indent=2)
@@ -118,15 +158,28 @@ def get_user(user, write, folder, update):
         js = json.dumps(u, indent=2)
         print(js, file=f)
 
-@users.command('get_all')
+@users.command('get_all', help='''Download users from a list of user ids/screen names in a CSV file.
+
+The result is stored as individual json files in your folder of choice.''')
 @click.argument('usersfile', 'File with a list of users to look up')
 @click.option('-f', '--folder', default="users")
+@click.option('-u', '--update', is_flag=True, default=False, help='Download user even if it is already present. WARNING: it will overwrite existing files!')
+@click.option('-d', '--delimiter', default=",")
+@click.option('-h', '--header', help='Discard the first line (use it as a header)',
+              is_flag=True, default=False)
+@click.option('-q', '--quotechar', default='"')
+@click.option('-c', '--column', type=int, default=0)
 @click.pass_context
-def get_users(ctx, usersfile, folder):
-    with open(usersfile) as f:
-        for line in f:
-            uid = line.strip()
-            ctx.invoke(get_user, folder=folder, user=uid, write=True)
+def get_users(ctx, usersfile, folder, update, delimiter, header, quotechar, column):
+    if update and not click.confirm('This may overwrite existing users. Continue?'):
+        click.echo('Cancelling')
+        return
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
+    for i in utils.download_file(wq, usersfile, folder, delimiter=delimiter,
+                                 batch_method=utils.user_download_batch,
+                                 update=update,
+                                 header=header, quotechar=quotechar,
+                                 column=column):
+        pass
 
 @users.command('crawl')
 @click.option('--db', required=True, help='Database to save all users.')
@@ -147,7 +200,7 @@ def crawl_users(ctx, usersfile, skip, until, threads, db):
         return ExitStack()
 
 
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     logger.info('Starting Network crawler with {} threads and {} credentials.'.format(threads,
                                                                                       len(wq.queue)))
 
@@ -311,7 +364,7 @@ def users_extractor(ctx):
 @click.pass_context
 def extract(ctx, recursive, user, name, initfile):
     print(locals())
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     dburi = ctx.obj['DBURI']
     utils.extract(wq,
                   recursive=recursive,
@@ -323,7 +376,7 @@ def extract(ctx, recursive, user, name, initfile):
 @extractor.command('reset')
 @click.pass_context
 def reset_extractor(ctx):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     db = ctx.obj['DBURI']
     session = make_session(db)
     session.query(ExtractorEntry).filter(ExtractorEntry.pending==True).update({'pending':False})
@@ -332,7 +385,7 @@ def reset_extractor(ctx):
 @click.argument('url', required=False)
 @click.pass_context
 def get_limits(ctx, url):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     total = {}
     for worker in wq.queue:
         resp = worker.client.application.rate_limit_status()
@@ -357,7 +410,8 @@ def get_limits(ctx, url):
 
 
 
-@main.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=False))
+@main.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=False),
+              help='''Issue a call to an endpoint of the Twitter API.''')
 @click.argument('cmd', nargs=1)
 @click.option('--tweets', is_flag=True, help='Fetch more tweets using smart pagination. Use --count to control how many tweets to fetch per call, and --max_count to set the number of desired tweets (or -1 to get as many as possible).', type=bool, default=False)
 @click.option('--users', is_flag=True, help='Fetch more users using smart pagination. Use --count to control how many users to fetch per call, and --max_count to set the number of desired users (or -1 to get as many as possible).', type=bool, default=False)
@@ -374,7 +428,7 @@ def api(ctx, cmd, tweets, users, api_args):
         if k in mappings:
             k = mappings[k]
         opts[k] = v
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     if tweets:
         resp = utils.consume_tweets(wq[cmd], **opts)
     elif users:
@@ -409,7 +463,7 @@ def stream(ctx):
 @click.option('-p', '--politelyretry', help='Politely retry after a hangup/connection error', is_flag=True, default=True)
 @click.pass_context
 def get_stream(ctx, locations, track, file, politelyretry):
-    wq = crawlers.StreamQueue.from_config(bconf.CONFIG_FILE, 1)
+    wq = crawlers.StreamQueue.from_config(conffile=bconf.CONFIG_FILE, max_workers=1)
 
     query_args = {}
     if locations:
bitter/crawlers.py

@@ -61,11 +61,13 @@ class FromCredentialsMixin(object):
 class FromConfigMixin(object):
 
     @classmethod
-    def from_config(cls, conffile=None, max_workers=None):
+    def from_config(cls, config=None, conffile=None, max_workers=None):
         wq = cls()
 
-        with utils.config(conffile) as c:
-            for cred in islice(c['credentials'], max_workers):
-                wq.ready(cls.worker_class(cred["user"], cred))
+        if not config:
+            with utils.config(conffile) as c:
+                config = c
+        for cred in islice(config['credentials'], max_workers):
+            wq.ready(cls.worker_class(cred["user"], cred))
         return wq
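With the new `config` parameter, a queue can now be built from an in-memory dictionary instead of a file. A minimal sketch, assuming the credential key names used by the CLI prompts (all values are placeholders):

```python
from bitter import crawlers

# Hypothetical in-memory configuration; normally this would come from the
# YAML config file or the BITTER_CONFIG environment variable.
config = {
    'credentials': [
        {'user': 'myuser',
         'consumer_key': 'XXX', 'consumer_secret': 'XXX',
         'token_key': 'XXX', 'token_secret': 'XXX'},
    ]
}

# No conffile needed: credentials are read straight from the dict.
wq = crawlers.TwitterQueue.from_config(config=config)
```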
bitter/utils.py
@@ -4,6 +4,7 @@ import logging
 import time
 import json
 import yaml
+import csv
 import io
 
 import signal
@@ -93,7 +94,7 @@ def read_config(conffile):
     p = conffile and get_config_path(conffile)
     if p:
         if not os.path.exists(p):
-            raise Exception('{} file does not exist.'.format(p))
+            raise IOError('{} file does not exist.'.format(p))
         f = open(p, 'r')
     elif 'BITTER_CONFIG' not in os.environ:
         raise Exception('No config file or BITTER_CONFIG env variable.')
@@ -103,6 +104,8 @@ def read_config(conffile):
 
 
 def write_config(conf, conffile=None):
+    if not conf:
+        conf = {'credentials': []}
     if conffile:
         p = get_config_path(conffile)
         with open(p, 'w') as f:
@@ -122,6 +125,7 @@ def create_config_file(conffile=None):
     conffile = get_config_path(conffile)
     with open(conffile, 'a'):
         pass
+    write_config(None, conffile)
 
 
 def get_credentials(conffile=None, inverse=False, **kwargs):
@@ -142,7 +146,11 @@ def delete_credentials(conffile=None, **creds):
 
 
 def add_credentials(conffile=None, **creds):
-    exist = get_credentials(conffile, **creds)
+    try:
+        exist = get_credentials(conffile, **creds)
+    except IOError:
+        exist = False
+        create_config_file(conffile)
     if exist:
         return
     with config(conffile) as c:
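Taken together, these changes let `add_credentials` bootstrap a missing config file instead of raising. The config is YAML with a single `credentials` list; a sketch of the expected layout (key names inferred from `write_config` and the CLI prompts, values are placeholders):

```yaml
credentials:
  - user: myuser
    consumer_key: XXX
    consumer_secret: XXX
    token_key: XXX
    token_secret: XXX
```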
@@ -451,86 +459,128 @@ def get_user(c, user):
     return c.users.lookup(screen_name=user)[0]
 
 def download_tweet(wq, tweetid, write=True, folder="downloaded_tweets", update=False):
-    cached = cached_tweet(tweetid, folder)
+    cached = cached_id(tweetid, folder)
     tweet = None
     if update or not cached:
         tweet = get_tweet(wq, tweetid)
-    js = json.dumps(tweet, indent=2)
+    js = json.dumps(tweet)
     if write:
         if tweet:
-            write_tweet_json(js, folder)
+            write_json(js, folder)
     else:
         print(js)
 
 
-def cached_tweet(tweetid, folder):
+def cached_id(oid, folder):
     tweet = None
-    file = os.path.join(folder, '%s.json' % tweetid)
+    file = os.path.join(folder, '%s.json' % oid)
     if os.path.exists(file) and os.path.isfile(file):
         try:
-            # print('%s: Tweet exists' % tweetid)
+            # print('%s: Object exists' % oid)
             with open(file) as f:
                 tweet = json.load(f)
         except Exception as ex:
-            logger.error('Error getting cached version of {}: {}'.format(tweetid, ex))
+            logger.error('Error getting cached version of {}: {}'.format(oid, ex))
     return tweet
 
-def write_tweet_json(js, folder):
-    tweetid = js['id']
-    file = tweet_file(tweetid, folder)
+def write_json(js, folder, oid=None):
+    if not oid:
+        oid = js['id']
+    file = id_file(oid, folder)
     if not os.path.exists(folder):
         os.makedirs(folder)
     with open(file, 'w') as f:
-        json.dump(js, f, indent=2)
-    logger.info('Written {} to file {}'.format(tweetid, file))
+        json.dump(js, f)
+    logger.info('Written {} to file {}'.format(oid, file))
 
-def tweet_file(tweetid, folder):
-    return os.path.join(folder, '%s.json' % tweetid)
+def id_file(oid, folder):
+    return os.path.join(folder, '%s.json' % oid)
 
-def tweet_fail_file(tweetid, folder):
+def fail_file(oid, folder):
     failsfolder = os.path.join(folder, 'failed')
     if not os.path.exists(failsfolder):
         os.makedirs(failsfolder)
-    return os.path.join(failsfolder, '%s.failed' % tweetid)
+    return os.path.join(failsfolder, '%s.failed' % oid)
 
-def tweet_failed(tweetid, folder):
-    return os.path.isfile(tweet_fail_file(tweetid, folder))
+def id_failed(oid, folder):
+    return os.path.isfile(fail_file(oid, folder))
 
-def download_tweets(wq, tweetsfile, folder, update=False, retry_failed=False, ignore_fails=True):
-    def filter_line(line):
-        tweetid = int(line)
-        # print('Checking {}'.format(tweetid))
-        if (cached_tweet(tweetid, folder) and not update) or (tweet_failed(tweetid, folder) and not retry_failed):
-            yield None
-        else:
-            yield line
-
-    def print_result(res):
-        tid, tweet = res
-        if tweet:
-            try:
-                write_tweet_json(tweet, folder=folder)
-                yield 1
-            except Exception as ex:
-                logger.error('%s: %s' % (tid, ex))
-                if not ignore_fails:
-                    raise
-        else:
-            logger.info('Tweet not recovered: {}'.format(tid))
-            with open(tweet_fail_file(tid, folder), 'w') as f:
-                print('Tweet not found', file=f)
-            yield -1
-
-    def download_batch(batch):
-        tweets = wq.statuses.lookup(_id=",".join(batch), map=True)['id']
-        return tweets.items()
-
-    with open(tweetsfile) as f:
-        lines = map(lambda x: x.strip(), f)
-        lines_to_crawl = filter(lambda x: x is not None, tqdm(parallel(filter_line, lines), desc='Total lines'))
-        tweets = parallel(download_batch, lines_to_crawl, 100)
+def tweet_download_batch(wq, batch):
+    tweets = wq.statuses.lookup(_id=",".join(batch), map=True)['id']
+    return tweets.items()
+
+
+def user_download_batch(wq, batch):
+    screen_names = []
+    user_ids = []
+    for elem in batch:
+        try:
+            user_ids.append(int(elem))
+        except ValueError:
+            screen_names.append(elem)
+    print('Downloading: {} - {}'.format(user_ids, screen_names))
+    users = wq.users.lookup(user_id=",".join(str(uid) for uid in user_ids),
+                            screen_name=",".join(screen_names))
+    found_ids = []
+    found_names = []
+    for user in users:
+        uid = user['id']
+        if uid in user_ids:
+            found_ids.append(uid)
+            yield (uid, user)
+        uname = user['screen_name']
+        if uname in screen_names:
+            found_names.append(uname)
+            yield (uname, user)
+    for uid in set(user_ids) - set(found_ids):
+        yield (uid, None)
+    for name in set(screen_names) - set(found_names):
+        yield (name, None)
+
+
+def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fails=True,
+                  batch_method=tweet_download_batch):
+    def filter_lines(line):
+        # print('Checking {}'.format(line))
+        oid = line
+        if (cached_id(oid, folder) and not update) or (id_failed(oid, folder) and not retry_failed):
+            yield None
+        else:
+            yield str(oid)
+
+    def print_result(res):
+        for oid, obj in res:
+            if obj:
+                try:
+                    write_json(obj, folder=folder, oid=oid)
+                    yield 1
+                except Exception as ex:
+                    logger.error('%s: %s' % (oid, ex))
+                    if not ignore_fails:
+                        raise
+            else:
+                logger.info('Object not recovered: {}'.format(oid))
+                with open(fail_file(oid, folder), 'w') as f:
+                    print('Object not found', file=f)
+                yield -1
+
+    objects_to_crawl = filter(lambda x: x is not None, tqdm(parallel(filter_lines, lst), desc='Total objects'))
+    batch_method = partial(batch_method, wq)
+    tweets = parallel(batch_method, objects_to_crawl, 100)
     for res in tqdm(parallel(print_result, tweets), desc='Queried'):
-        pass
+        yield res
+
+
+def download_file(wq, csvfile, folder, column=0, delimiter=',',
+                  header=False, quotechar='"', batch_method=tweet_download_batch,
+                  **kwargs):
+    with open(csvfile) as f:
+        csvreader = csv.reader(f, delimiter=str(delimiter), quotechar=str(quotechar))
+        if header:
+            next(csvreader)
+        tweets = map(lambda row: row[column].strip(), csvreader)
+        for res in download_list(wq, tweets, folder, batch_method=batch_method,
+                                 **kwargs):
+            yield res
+
+
 def download_timeline(wq, user):
     return wq.statuses.user_timeline(id=user)
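The `get_all` CLI commands above are thin wrappers around this generator. A minimal sketch of driving it directly, assuming a valid config file and an existing `tweet_ids.csv` (the file and folder names are illustrative):

```python
from bitter import crawlers, utils

# Build a queue of authenticated API workers from the config file.
wq = crawlers.TwitterQueue.from_config(conffile='.bitter.yaml')

# download_file is a generator: it yields 1 per object written and -1 per
# failed lookup, so it must be consumed for the downloads to happen.
for _ in utils.download_file(wq, 'tweet_ids.csv', 'tweet_info',
                             batch_method=utils.tweet_download_batch,
                             header=True, column=0):
    pass
```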
tests/test_utils.py

@@ -54,7 +54,7 @@ class TestUtils(TestCase):
         toc = time.time()
         assert (tic-toc) < 600
         resp2 = utils.parallel(echo, [1,2,3,4], chunksize=2)
-        assert list(resp2) == [1,2,3,4]
+        assert list(resp2) == [1,2, 3,4]
 
 
 class TestUtilsEnv(TestUtils):
@@ -68,5 +68,3 @@ class TestUtilsEnv(TestUtils):
     def tearDown(self):
         if hasattr(self, 'oldenv'):
             os.environ['BITTER_CONFIG'] = self.oldenv
-
-