Improve CLI. Add credentials

master
J. Fernando Sánchez 6 years ago
parent 6259013978
commit 5199d5b5aa

@ -32,9 +32,89 @@ bitter api statuses/user_timeline --id thepsf --count 500
```
## Adding credentials
```
bitter --config <YOUR CONFIGURATION FILE> credentials add
```
You can specify the parameters directly in the command, or let the CLI prompt you for each of them.
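For instance, this adds a set of credentials non-interactively (the user name and keys below are placeholders):
```
bitter --config .bitter.yaml credentials add --consumer_key <CONSUMER_KEY> --consumer_secret <CONSUMER_SECRET> --token_key <ACCESS_TOKEN> --token_secret <ACCESS_TOKEN_SECRET> myuser
```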
# Examples
The CLI can query the REST API:
## Downloading a list of tweets
Bitter can download tweets from a list of tweet ids in a CSV file.
The result is stored as individual JSON files in your folder of choice.
You can even specify which column of the file contains the tweet ids.
Bitter will not try to download tweets that are already in that folder, unless you run the command with the update option.
```
Usage: bitter tweet get_all [OPTIONS] TWEETSFILE
Download tweets from a list of tweets in a CSV file. The result is stored
as individual json files in your folder of choice.
Options:
-f, --folder TEXT
-d, --delimiter TEXT
-h, --header Discard the first line (treat it as a header)
-q, --quotechar TEXT
-c, --column INTEGER
--help Show this message and exit.
```
For instance, this will download the tweets listed in `tweet_ids.csv` into the `tweet_info` folder:
```
bitter tweet get_all -f tweet_info tweet_ids.csv
```
## Downloading a list of users
Bitter downloads users and tweets in a similar way:
```
Usage: bitter users get_all [OPTIONS] USERSFILE
Download users from a list of user ids/screen names in a CSV file. The
result is stored as individual json files in your folder of choice.
Options:
-f, --folder TEXT
-d, --delimiter TEXT
-h, --header Discard the first line (treat it as a header)
-q, --quotechar TEXT
-c, --column INTEGER
--help Show this message and exit.
```
The only difference is that users can be specified either by `screen_name` or by `user_id`.
This method does not try to resolve screen names to user ids, so a user may be downloaded twice if it appears in both forms.
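For instance, this will download the users listed in `user_ids.csv` into the `user_info` folder (file and folder names are just examples):
```
bitter users get_all -f user_info user_ids.csv
```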
## Downloading a stream
```
Usage: bitter stream get [OPTIONS]
Options:
-l, --locations TEXT
-t, --track TEXT
-f, --file TEXT File to store the stream of tweets. Default: standard output
-p, --politelyretry Politely retry after a hangup/connection error
--help Show this message and exit.
```
```
bitter --config .bitter.yaml stream get
```
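For instance, to track a keyword and store the tweets in a file (the keyword and file name are just examples):
```
bitter --config .bitter.yaml stream get -t bitter -f tweets.jsonlines
```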
## REST queries
In newer versions of bitter, the individual methods to download tweets/users through the REST API are being replaced with a generic method to call any endpoint of the API:
```
bitter api <URL endpoint> --parameter VALUE ... | [--tweets | --users] [--max_count MAX_COUNT] [--count COUNT_PER_CALL]
```
For example, this will search for recent tweets matching a query and append them to a file:
```
python -m bitter.cli --config .bitter.yaml api '/search/tweets' --result_type recent --q 'bitter OR #bitter OR @bitter' --tweet_mode extended --tweets --max_count 5000 >> mytweets.jsonlines
```
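The pagination flags also work with the timeline endpoint shown at the top of this README (the counts here are only illustrative):
```
bitter api statuses/user_timeline --id thepsf --count 200 --tweets --max_count 1000
```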

@ -1 +1 @@
0.8.0
0.9.0

@ -41,6 +41,32 @@ def main(ctx, verbose, logging_level, config, credentials):
if os.path.exists(utils.get_config_path(credentials)):
utils.copy_credentials_to_config(credentials, config)
@main.group()
@click.pass_context
def credentials(ctx):
pass
@credentials.command('add')
@click.option('--consumer_key', default=None)
@click.option('--consumer_secret', default=None)
@click.option('--token_key', default=None)
@click.option('--token_secret', default=None)
@click.argument('user_name')
def add(user_name, consumer_key, consumer_secret, token_key, token_secret):
if not consumer_key:
consumer_key = click.prompt('Please enter your CONSUMER KEY')
if not consumer_secret:
consumer_secret = click.prompt('Please enter your CONSUMER SECRET')
if not token_key:
token_key = click.prompt('Please enter your ACCESS TOKEN')
if not token_secret:
token_secret = click.prompt('Please enter your ACCESS TOKEN SECRET')
utils.add_credentials(conffile=bconf.CONFIG_FILE, user=user_name, consumer_key=consumer_key, consumer_secret=consumer_secret,
token_key=token_key, token_secret=token_secret)
click.echo('Credentials added for {}'.format(user_name))
@main.group()
@click.pass_context
def tweet(ctx):
@ -52,22 +78,36 @@ def tweet(ctx):
@click.option('-u', '--update', help="Update the file even if the tweet exists", is_flag=True, default=False)
@click.argument('tweetid')
def get_tweet(tweetid, write, folder, update):
wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
utils.download_tweet(wq, tweetid, write, folder, update)
@tweet.command('get_all')
@tweet.command('get_all', help='''Download tweets from a list of tweets in a CSV file.
The result is stored as individual json files in your folder of choice.''')
@click.argument('tweetsfile')  # file with a list of tweets to look up (click arguments take no help text)
@click.option('-f', '--folder', default="tweets")
@click.option('-u', '--update', is_flag=True, default=False, help='Download the tweet even if it is already present. WARNING: it will overwrite existing files!')
@click.option('-d', '--delimiter', default=",")
@click.option('-h', '--header', help='Discard the first line (treat it as a header)',
is_flag=True, default=False)
@click.option('-q', '--quotechar', default='"')
@click.option('-c', '--column', type=int, default=0)
@click.pass_context
def get_tweets(ctx, tweetsfile, folder):
wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
utils.download_tweets(wq, tweetsfile, folder)
def get_tweets(ctx, tweetsfile, folder, update, delimiter, header, quotechar, column):
if update and not click.confirm('This may overwrite existing tweets. Continue?'):
click.echo('Cancelling')
return
wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
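# download_file is a generator: iterating over it is what drives the downloads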
for i in utils.download_file(wq, tweetsfile, folder, delimiter=delimiter,
batch_method=utils.tweet_download_batch,
header=header, quotechar=quotechar,
column=column, update=update):
pass
@tweet.command('search')
@click.argument('query')
@click.pass_context
def search(ctx, query):
wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
t = utils.search_tweet(wq, query)
print(json.dumps(t, indent=2))
@ -75,7 +115,7 @@ def search(ctx, query):
@click.argument('user')
@click.pass_context
def timeline(ctx, user):
wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
t = utils.user_timeline(wq, user)
print(json.dumps(t, indent=2))
@ -101,7 +141,7 @@ def list_users(ctx, db):
@click.option('-f', '--folder', default="users")
@click.option('-u', '--update', help="Update the file even if the user exists", is_flag=True, default=False)
def get_user(user, write, folder, update):
wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
if not write:
u = utils.get_user(wq, user)
js = json.dumps(u, indent=2)
@ -118,15 +158,28 @@ def get_user(user, write, folder, update):
js = json.dumps(u, indent=2)
print(js, file=f)
@users.command('get_all')
@users.command('get_all', help='''Download users from a list of user ids/screen names in a CSV file.
The result is stored as individual json files in your folder of choice.''')
@click.argument('usersfile')  # file with a list of users to look up (click arguments take no help text)
@click.option('-f', '--folder', default="users")
@click.option('-u', '--update', is_flag=True, default=False, help='Download user even if it is already present. WARNING: it will overwrite existing files!')
@click.option('-d', '--delimiter', default=",")
@click.option('-h', '--header', help='Discard the first line (treat it as a header)',
is_flag=True, default=False)
@click.option('-q', '--quotechar', default='"')
@click.option('-c', '--column', type=int, default=0)
@click.pass_context
def get_users(ctx, usersfile, folder):
with open(usersfile) as f:
for line in f:
uid = line.strip()
ctx.invoke(get_user, folder=folder, user=uid, write=True)
def get_users(ctx, usersfile, folder, update, delimiter, header, quotechar, column):
if update and not click.confirm('This may overwrite existing users. Continue?'):
click.echo('Cancelling')
return
wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
for i in utils.download_file(wq, usersfile, folder, delimiter=delimiter,
batch_method=utils.user_download_batch,
update=update,
header=header, quotechar=quotechar,
column=column):
pass
@users.command('crawl')
@click.option('--db', required=True, help='Database to save all users.')
@ -147,7 +200,7 @@ def crawl_users(ctx, usersfile, skip, until, threads, db):
return ExitStack()
wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
logger.info('Starting Network crawler with {} threads and {} credentials.'.format(threads,
len(wq.queue)))
@ -311,7 +364,7 @@ def users_extractor(ctx):
@click.pass_context
def extract(ctx, recursive, user, name, initfile):
print(locals())
wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
dburi = ctx.obj['DBURI']
utils.extract(wq,
recursive=recursive,
@ -323,7 +376,7 @@ def extract(ctx, recursive, user, name, initfile):
@extractor.command('reset')
@click.pass_context
def reset_extractor(ctx):
wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
db = ctx.obj['DBURI']
session = make_session(db)
session.query(ExtractorEntry).filter(ExtractorEntry.pending==True).update({'pending':False})
@ -332,7 +385,7 @@ def reset_extractor(ctx):
@click.argument('url', required=False)
@click.pass_context
def get_limits(ctx, url):
wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
total = {}
for worker in wq.queue:
resp = worker.client.application.rate_limit_status()
@ -357,7 +410,8 @@ def get_limits(ctx, url):
@main.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=False))
@main.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=False),
help='''Issue a call to an endpoint of the Twitter API.''')
@click.argument('cmd', nargs=1)
@click.option('--tweets', is_flag=True, help='Fetch more tweets using smart pagination. Use --count to control how many tweets to fetch per call, and --max_count to set the number of desired tweets (or -1 to get as many as possible).', type=bool, default=False)
@click.option('--users', is_flag=True, help='Fetch more users using smart pagination. Use --count to control how many users to fetch per call, and --max_count to set the number of desired users (or -1 to get as many as possible).', type=bool, default=False)
@ -374,7 +428,7 @@ def api(ctx, cmd, tweets, users, api_args):
if k in mappings:
k = mappings[k]
opts[k] = v
wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
if tweets:
resp = utils.consume_tweets(wq[cmd], **opts)
elif users:
@ -409,7 +463,7 @@ def stream(ctx):
@click.option('-p', '--politelyretry', help='Politely retry after a hangup/connection error', is_flag=True, default=True)
@click.pass_context
def get_stream(ctx, locations, track, file, politelyretry):
wq = crawlers.StreamQueue.from_config(bconf.CONFIG_FILE, 1)
wq = crawlers.StreamQueue.from_config(conffile=bconf.CONFIG_FILE, max_workers=1)
query_args = {}
if locations:

@ -61,12 +61,14 @@ class FromCredentialsMixin(object):
class FromConfigMixin(object):
@classmethod
def from_config(cls, conffile=None, max_workers=None):
def from_config(cls, config=None, conffile=None, max_workers=None):
wq = cls()
with utils.config(conffile) as c:
for cred in islice(c['credentials'], max_workers):
wq.ready(cls.worker_class(cred["user"], cred))
if not config:
with utils.config(conffile) as c:
config = c
for cred in islice(config['credentials'], max_workers):
wq.ready(cls.worker_class(cred["user"], cred))
return wq
class TwitterWorker(object):
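A minimal sketch of how the new `config` parameter can be used (the credentials dict below is a hypothetical example; normally it would come from the YAML config file or the BITTER_CONFIG environment variable):
```python
from bitter import crawlers

config = {'credentials': [{'user': 'myuser',
                           'consumer_key': '...',
                           'consumer_secret': '...',
                           'token_key': '...',
                           'token_secret': '...'}]}

# Pass an in-memory config directly...
wq = crawlers.TwitterQueue.from_config(config=config)
# ...or let from_config read it from a file, as before.
wq = crawlers.TwitterQueue.from_config(conffile='.bitter.yaml')
```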

@ -4,6 +4,7 @@ import logging
import time
import json
import yaml
import csv
import io
import signal
@ -93,7 +94,7 @@ def read_config(conffile):
p = conffile and get_config_path(conffile)
if p:
if not os.path.exists(p):
raise Exception('{} file does not exist.'.format(p))
raise IOError('{} file does not exist.'.format(p))
f = open(p, 'r')
elif 'BITTER_CONFIG' not in os.environ:
raise Exception('No config file or BITTER_CONFIG env variable.')
@ -103,6 +104,8 @@ def read_config(conffile):
def write_config(conf, conffile=None):
if not conf:
conf = {'credentials': []}
if conffile:
p = get_config_path(conffile)
with open(p, 'w') as f:
@ -122,6 +125,7 @@ def create_config_file(conffile=None):
conffile = get_config_path(conffile)
with open(conffile, 'a'):
pass
write_config(None, conffile)
def get_credentials(conffile=None, inverse=False, **kwargs):
@ -142,7 +146,11 @@ def delete_credentials(conffile=None, **creds):
def add_credentials(conffile=None, **creds):
exist = get_credentials(conffile, **creds)
try:
exist = get_credentials(conffile, **creds)
except IOError:
exist = False
create_config_file(conffile)
if exist:
return
with config(conffile) as c:
@ -451,86 +459,128 @@ def get_user(c, user):
return c.users.lookup(screen_name=user)[0]
def download_tweet(wq, tweetid, write=True, folder="downloaded_tweets", update=False):
cached = cached_tweet(tweetid, folder)
cached = cached_id(tweetid, folder)
tweet = None
if update or not cached:
tweet = get_tweet(wq, tweetid)
js = json.dumps(tweet, indent=2)
js = json.dumps(tweet)
if write:
if tweet:
write_tweet_json(js, folder)
write_json(tweet, folder)  # write_json expects the parsed object, not the serialized string
else:
print(js)
def cached_tweet(tweetid, folder):
def cached_id(oid, folder):
obj = None
file = os.path.join(folder, '%s.json' % tweetid)
file = os.path.join(folder, '%s.json' % oid)
if os.path.exists(file) and os.path.isfile(file):
try:
# print('%s: Tweet exists' % tweetid)
# print('%s: Object exists' % oid)
with open(file) as f:
obj = json.load(f)
except Exception as ex:
logger.error('Error getting cached version of {}: {}'.format(tweetid, ex))
logger.error('Error getting cached version of {}: {}'.format(oid, ex))
return obj
def write_tweet_json(js, folder):
tweetid = js['id']
file = tweet_file(tweetid, folder)
def write_json(js, folder, oid=None):
if not oid:
oid = js['id']
file = id_file(oid, folder)
if not os.path.exists(folder):
os.makedirs(folder)
with open(file, 'w') as f:
json.dump(js, f, indent=2)
logger.info('Written {} to file {}'.format(tweetid, file))
json.dump(js, f)
logger.info('Written {} to file {}'.format(oid, file))
def tweet_file(tweetid, folder):
return os.path.join(folder, '%s.json' % tweetid)
def id_file(oid, folder):
return os.path.join(folder, '%s.json' % oid)
def tweet_fail_file(tweetid, folder):
def fail_file(oid, folder):
failsfolder = os.path.join(folder, 'failed')
if not os.path.exists(failsfolder):
os.makedirs(failsfolder)
return os.path.join(failsfolder, '%s.failed' % tweetid)
return os.path.join(failsfolder, '%s.failed' % oid)
def tweet_failed(tweetid, folder):
return os.path.isfile(tweet_fail_file(tweetid, folder))
def id_failed(oid, folder):
return os.path.isfile(fail_file(oid, folder))
def download_tweets(wq, tweetsfile, folder, update=False, retry_failed=False, ignore_fails=True):
def filter_line(line):
tweetid = int(line)
# print('Checking {}'.format(tweetid))
if (cached_tweet(tweetid, folder) and not update) or (tweet_failed(tweetid, folder) and not retry_failed):
def tweet_download_batch(wq, batch):
tweets = wq.statuses.lookup(_id=",".join(batch), map=True)['id']
return tweets.items()
def user_download_batch(wq, batch):
screen_names = []
user_ids = []
for elem in batch:
try:
user_ids.append(int(elem))
except ValueError:
screen_names.append(elem)
print('Downloading: {} - {}'.format(user_ids, screen_names))
users = wq.users.lookup(user_id=",".join(str(uid) for uid in user_ids), screen_name=",".join(screen_names))
found_ids = []
found_names = []
for user in users:
uid = user['id']
if uid in user_ids:
found_ids.append(uid)
yield (uid, user)
uname = user['screen_name']
if uname in screen_names:
found_names.append(uname)
yield (uname, user)
for uid in set(user_ids) - set(found_ids):
yield (uid, None)
for name in set(screen_names) - set(found_names):
yield (name, None)
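# Both *_download_batch helpers follow the same contract assumed by download_list
# below: given a batch of ids/names, they yield (identifier, object) pairs, with
# object set to None when the lookup failed, so failures can be recorded on disk.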
def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fails=True,
batch_method=tweet_download_batch):
def filter_lines(line):
# print('Checking {}'.format(line))
oid = line
if (cached_id(oid, folder) and not update) or (id_failed(oid, folder) and not retry_failed):
yield None
else:
yield line
yield str(oid)
def print_result(res):
tid, tweet = res
if tweet:
try:
write_tweet_json(tweet, folder=folder)
yield 1
except Exception as ex:
logger.error('%s: %s' % (tid, ex))
if not ignore_fails:
raise
else:
logger.info('Tweet not recovered: {}'.format(tid))
with open(tweet_fail_file(tid, folder), 'w') as f:
print('Tweet not found', file=f)
yield -1
def download_batch(batch):
tweets = wq.statuses.lookup(_id=",".join(batch), map=True)['id']
return tweets.items()
with open(tweetsfile) as f:
lines = map(lambda x: x.strip(), f)
lines_to_crawl = filter(lambda x: x is not None, tqdm(parallel(filter_line, lines), desc='Total lines'))
tweets = parallel(download_batch, lines_to_crawl, 100)
for res in tqdm(parallel(print_result, tweets), desc='Queried'):
pass
for oid, obj in res:
if obj:
try:
write_json(obj, folder=folder, oid=oid)
yield 1
except Exception as ex:
logger.error('%s: %s' % (oid, ex))
if not ignore_fails:
raise
else:
logger.info('Object not recovered: {}'.format(oid))
with open(fail_file(oid, folder), 'w') as f:
print('Object not found', file=f)
yield -1
objects_to_crawl = filter(lambda x: x is not None, tqdm(parallel(filter_lines, lst), desc='Total objects'))
batch_method = partial(batch_method, wq)
results = parallel(batch_method, objects_to_crawl, 100)
for res in tqdm(parallel(print_result, results), desc='Queried'):
yield res
def download_file(wq, csvfile, folder, column=0, delimiter=',',
header=False, quotechar='"', batch_method=tweet_download_batch,
**kwargs):
with open(csvfile) as f:
csvreader = csv.reader(f, delimiter=str(delimiter), quotechar=str(quotechar))
if header:
next(csvreader)
objects = map(lambda row: row[column].strip(), csvreader)
for res in download_list(wq, objects, folder, batch_method=batch_method,
**kwargs):
yield res
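A rough usage sketch for `download_file` (file and folder names are placeholders, and a valid configuration with credentials is assumed):
```python
from bitter import crawlers, utils
from bitter import config as bconf

wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
# Download every user listed in the second column of the CSV, skipping the header row.
for res in utils.download_file(wq, 'user_ids.csv', 'user_info',
                               column=1, header=True,
                               batch_method=utils.user_download_batch):
    pass  # res is 1 for each saved object, -1 for each failure
```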
def download_timeline(wq, user):
return wq.statuses.user_timeline(id=user)

@ -54,7 +54,7 @@ class TestUtils(TestCase):
toc = time.time()
assert (tic-toc) < 600
resp2 = utils.parallel(echo, [1,2,3,4], chunksize=2)
assert list(resp2) == [1,2,3,4]
assert list(resp2) == [1,2, 3,4]
class TestUtilsEnv(TestUtils):
@ -68,5 +68,3 @@ class TestUtilsEnv(TestUtils):
def tearDown(self):
if hasattr(self, 'oldenv'):
os.environ['BITTER_CONFIG'] = self.oldenv
