mirror of https://github.com/balkian/bitter.git synced 2024-12-22 00:18:12 +00:00

Improve CLI. Add credentials

Author: J. Fernando Sánchez, 2018-03-20 13:29:18 +01:00
parent 6259013978
commit 5199d5b5aa
6 changed files with 264 additions and 80 deletions

View File

@@ -32,9 +32,89 @@ bitter api statuses/user_timeline --id thepsf --count 500

## Adding credentials

```
bitter --config <YOUR CONFIGURATION FILE> credentials add
```

You can specify the parameters in the command or let the command line guide you through the process.
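If you already have the tokens at hand, you can pass everything in a single call instead of being prompted. A sketch with a placeholder account name and keys:

```
bitter --config .bitter.yaml credentials add my_account --consumer_key XXX --consumer_secret XXX --token_key XXX --token_secret XXX
```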
# Examples

## Downloading a list of tweets

Bitter can download tweets from a list of tweet ids in a CSV file.
The result is stored as individual json files in your folder of choice.
You can even specify which column holds the tweet ids.
Bitter will not try to download tweets that have already been downloaded, unless you ask it to update them.

```
Usage: bitter tweet get_all [OPTIONS] TWEETSFILE

  Download tweets from a list of tweets in a CSV file. The result is stored
  as individual json files in your folder of choice.

Options:
  -f, --folder TEXT
  -d, --delimiter TEXT
  -h, --header          Discard the first line (use it as a header)
  -q, --quotechar TEXT
  -c, --column INTEGER
  --help                Show this message and exit.
```

For instance, this will download the tweets listed in `tweet_ids.csv` into the `tweet_info` folder:

```
bitter tweet get_all -f tweet_info tweet_ids.csv
```
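If the CSV has a header row and the ids are not in the first column, combine the parsing flags. For example, assuming a semicolon-delimited file with tweet ids in its third column (columns are numbered from 0):

```
bitter tweet get_all -f tweet_info -h -d ';' -c 2 tweet_ids.csv
```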
## Downloading a list of users

Bitter downloads users and tweets in a similar way:

```
Usage: bitter users get_all [OPTIONS] USERSFILE

  Download users from a list of user ids/screen names in a CSV file. The
  result is stored as individual json files in your folder of choice.

Options:
  -f, --folder TEXT
  -d, --delimiter TEXT
  -h, --header          Discard the first line (use it as a header)
  -q, --quotechar TEXT
  -c, --column INTEGER
  --help                Show this message and exit.
```

The only difference is that users can be looked up either by `screen_name` or by `user_id`.
This method does not try to resolve screen names to user ids, so a user may be downloaded more than once if it appears in both forms.
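The input file may freely mix the two: entries that parse as integers are treated as user ids, and everything else as screen names. For instance, with a hypothetical `users.csv` holding one user per line:

```
bitter users get_all -f user_info users.csv
```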
## Downloading a stream

```
Usage: bitter stream get [OPTIONS]

Options:
  -l, --locations TEXT
  -t, --track TEXT
  -f, --file TEXT       File to store the stream of tweets. Default: standard output
  -p, --politelyretry   Politely retry after a hangup/connection error
  --help                Show this message and exit.
```

For instance, this streams tweets to standard output using the credentials in `.bitter.yaml`:

```
bitter --config .bitter.yaml stream get
```
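To filter the stream by keyword and store it in a file instead of printing it, a sketch with a hypothetical keyword and output path:

```
bitter --config .bitter.yaml stream get -t bitter -f bitter_stream.jsonlines
```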
## REST queries

In newer versions of bitter, the individual methods to download tweets/users through the REST API are being replaced with a generic method to call any API endpoint:

```
bitter api <URL endpoint> --parameter VALUE ... [--tweets | --users] [--max_count MAX_COUNT] [--count COUNT_PER_CALL]
```

For example, this searches for recent tweets mentioning bitter and appends them to `mytweets.jsonlines`:

```
python -m bitter.cli --config .bitter.yaml api '/search/tweets' --result_type recent --q 'bitter OR #bitter OR @bitter' --tweet_mode extended --tweets --max_count 5000 >> mytweets.jsonlines
```
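The `--users` flag paginates endpoints that return users in the same way. A sketch, assuming the `/followers/list` endpoint and its `screen_name` parameter from the Twitter REST API:

```
bitter api '/followers/list' --screen_name thepsf --users --max_count 1000 >> followers.jsonlines
```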

View File

@@ -1 +1 @@
-0.8.0
+0.9.0

View File

@@ -41,6 +41,32 @@ def main(ctx, verbose, logging_level, config, credentials):
     if os.path.exists(utils.get_config_path(credentials)):
         utils.copy_credentials_to_config(credentials, config)
 
+@main.group()
+@click.pass_context
+def credentials(ctx):
+    pass
+
+@credentials.command('add')
+@click.option('--consumer_key', default=None)
+@click.option('--consumer_secret', default=None)
+@click.option('--token_key', default=None)
+@click.option('--token_secret', default=None)
+@click.argument('user_name')
+def add(user_name, consumer_key, consumer_secret, token_key, token_secret):
+    if not consumer_key:
+        consumer_key = click.prompt('Please, enter your CONSUMER KEY')
+    if not consumer_secret:
+        consumer_secret = click.prompt('Please, enter your CONSUMER SECRET')
+    if not token_key:
+        token_key = click.prompt('Please, enter your ACCESS TOKEN')
+    if not token_secret:
+        token_secret = click.prompt('Please, enter your ACCESS TOKEN SECRET')
+    utils.add_credentials(conffile=bconf.CONFIG_FILE, user=user_name, consumer_key=consumer_key, consumer_secret=consumer_secret,
+                          token_key=token_key, token_secret=token_secret)
+    click.echo('Credentials added for {}'.format(user_name))
+
 @main.group()
 @click.pass_context
 def tweet(ctx):
@@ -52,22 +78,36 @@ def tweet(ctx):
 @click.option('-u', '--update', help="Update the file even if the tweet exists", is_flag=True, default=False)
 @click.argument('tweetid')
 def get_tweet(tweetid, write, folder, update):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     utils.download_tweet(wq, tweetid, write, folder, update)
 
-@tweet.command('get_all')
+@tweet.command('get_all', help='''Download tweets from a list of tweets in a CSV file.
+The result is stored as individual json files in your folder of choice.''')
 @click.argument('tweetsfile', 'File with a list of tweets to look up')
 @click.option('-f', '--folder', default="tweets")
+@click.option('-u', '--update', is_flag=True, default=False, help='Download the tweet even if it is already present. WARNING: it will overwrite existing files!')
+@click.option('-d', '--delimiter', default=",")
+@click.option('-h', '--header', help='Discard the first line (use it as a header)',
+              is_flag=True, default=False)
+@click.option('-q', '--quotechar', default='"')
+@click.option('-c', '--column', type=int, default=0)
 @click.pass_context
-def get_tweets(ctx, tweetsfile, folder):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
-    utils.download_tweets(wq, tweetsfile, folder)
+def get_tweets(ctx, tweetsfile, folder, update, delimiter, header, quotechar, column):
+    if update and not click.confirm('This may overwrite existing tweets. Continue?'):
+        click.echo('Cancelling')
+        return
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
+    for i in utils.download_file(wq, tweetsfile, folder, delimiter=delimiter,
+                                 batch_method=utils.tweet_download_batch,
+                                 header=header, quotechar=quotechar,
+                                 column=column, update=update):
+        pass
 
 @tweet.command('search')
 @click.argument('query')
 @click.pass_context
 def search(ctx, query):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     t = utils.search_tweet(wq, query)
     print(json.dumps(t, indent=2))
@@ -75,7 +115,7 @@ def search(ctx, query):
 @click.argument('user')
 @click.pass_context
 def timeline(ctx, user):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     t = utils.user_timeline(wq, user)
     print(json.dumps(t, indent=2))
@@ -101,7 +141,7 @@ def list_users(ctx, db):
 @click.option('-f', '--folder', default="users")
 @click.option('-u', '--update', help="Update the file even if the user exists", is_flag=True, default=False)
 def get_user(user, write, folder, update):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     if not write:
         u = utils.get_user(wq, user)
         js = json.dumps(u, indent=2)
@@ -118,15 +158,28 @@ def get_user(user, write, folder, update):
         js = json.dumps(u, indent=2)
         print(js, file=f)
 
-@users.command('get_all')
+@users.command('get_all', help='''Download users from a list of user ids/screen names in a CSV file.
+The result is stored as individual json files in your folder of choice.''')
 @click.argument('usersfile', 'File with a list of users to look up')
 @click.option('-f', '--folder', default="users")
+@click.option('-u', '--update', is_flag=True, default=False, help='Download user even if it is already present. WARNING: it will overwrite existing files!')
+@click.option('-d', '--delimiter', default=",")
+@click.option('-h', '--header', help='Discard the first line (use it as a header)',
+              is_flag=True, default=False)
+@click.option('-q', '--quotechar', default='"')
+@click.option('-c', '--column', type=int, default=0)
 @click.pass_context
-def get_users(ctx, usersfile, folder):
-    with open(usersfile) as f:
-        for line in f:
-            uid = line.strip()
-            ctx.invoke(get_user, folder=folder, user=uid, write=True)
+def get_users(ctx, usersfile, folder, update, delimiter, header, quotechar, column):
+    if update and not click.confirm('This may overwrite existing users. Continue?'):
+        click.echo('Cancelling')
+        return
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
+    for i in utils.download_file(wq, usersfile, folder, delimiter=delimiter,
+                                 batch_method=utils.user_download_batch,
+                                 update=update,
+                                 header=header, quotechar=quotechar,
+                                 column=column):
+        pass
 
 @users.command('crawl')
 @click.option('--db', required=True, help='Database to save all users.')
@@ -147,7 +200,7 @@ def crawl_users(ctx, usersfile, skip, until, threads, db):
         return ExitStack()
 
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     logger.info('Starting Network crawler with {} threads and {} credentials.'.format(threads,
                                                                                       len(wq.queue)))
@@ -311,7 +364,7 @@ def users_extractor(ctx):
 @click.pass_context
 def extract(ctx, recursive, user, name, initfile):
     print(locals())
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     dburi = ctx.obj['DBURI']
     utils.extract(wq,
                   recursive=recursive,
@@ -323,7 +376,7 @@ def extract(ctx, recursive, user, name, initfile):
 @extractor.command('reset')
 @click.pass_context
 def reset_extractor(ctx):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     db = ctx.obj['DBURI']
     session = make_session(db)
     session.query(ExtractorEntry).filter(ExtractorEntry.pending==True).update({'pending':False})
@@ -332,7 +385,7 @@ def reset_extractor(ctx):
 @click.argument('url', required=False)
 @click.pass_context
 def get_limits(ctx, url):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     total = {}
     for worker in wq.queue:
         resp = worker.client.application.rate_limit_status()
@@ -357,7 +410,8 @@ def get_limits(ctx, url):
-@main.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=False))
+@main.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=False),
+              help='''Issue a call to an endpoint of the Twitter API.''')
 @click.argument('cmd', nargs=1)
 @click.option('--tweets', is_flag=True, help='Fetch more tweets using smart pagination. Use --count to control how many tweets to fetch per call, and --max_count to set the number of desired tweets (or -1 to get as many as possible).', type=bool, default=False)
 @click.option('--users', is_flag=True, help='Fetch more users using smart pagination. Use --count to control how many users to fetch per call, and --max_count to set the number of desired users (or -1 to get as many as possible).', type=bool, default=False)
@@ -374,7 +428,7 @@ def api(ctx, cmd, tweets, users, api_args):
         if k in mappings:
             k = mappings[k]
         opts[k] = v
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     if tweets:
         resp = utils.consume_tweets(wq[cmd], **opts)
     elif users:
@@ -409,7 +463,7 @@ def stream(ctx):
 @click.option('-p', '--politelyretry', help='Politely retry after a hangup/connection error', is_flag=True, default=True)
 @click.pass_context
 def get_stream(ctx, locations, track, file, politelyretry):
-    wq = crawlers.StreamQueue.from_config(bconf.CONFIG_FILE, 1)
+    wq = crawlers.StreamQueue.from_config(conffile=bconf.CONFIG_FILE, max_workers=1)
     query_args = {}
     if locations:

View File

@@ -61,12 +61,14 @@ class FromCredentialsMixin(object):
 
 class FromConfigMixin(object):
     @classmethod
-    def from_config(cls, conffile=None, max_workers=None):
+    def from_config(cls, config=None, conffile=None, max_workers=None):
         wq = cls()
 
-        with utils.config(conffile) as c:
-            for cred in islice(c['credentials'], max_workers):
-                wq.ready(cls.worker_class(cred["user"], cred))
+        if not config:
+            with utils.config(conffile) as c:
+                config = c
+        for cred in islice(config['credentials'], max_workers):
+            wq.ready(cls.worker_class(cred["user"], cred))
         return wq
 
 class TwitterWorker(object):
View File

@@ -4,6 +4,7 @@ import logging
 import time
 import json
 import yaml
+import csv
 import io
 import signal
@@ -93,7 +94,7 @@ def read_config(conffile):
     p = conffile and get_config_path(conffile)
     if p:
         if not os.path.exists(p):
-            raise Exception('{} file does not exist.'.format(p))
+            raise IOError('{} file does not exist.'.format(p))
         f = open(p, 'r')
     elif 'BITTER_CONFIG' not in os.environ:
         raise Exception('No config file or BITTER_CONFIG env variable.')
@@ -103,6 +104,8 @@ def read_config(conffile):
 def write_config(conf, conffile=None):
+    if not conf:
+        conf = {'credentials': []}
     if conffile:
         p = get_config_path(conffile)
         with open(p, 'w') as f:
@@ -122,6 +125,7 @@ def create_config_file(conffile=None):
     conffile = get_config_path(conffile)
     with open(conffile, 'a'):
         pass
+    write_config(None, conffile)
 
 def get_credentials(conffile=None, inverse=False, **kwargs):
@@ -142,7 +146,11 @@ def delete_credentials(conffile=None, **creds):
 
 def add_credentials(conffile=None, **creds):
-    exist = get_credentials(conffile, **creds)
+    try:
+        exist = get_credentials(conffile, **creds)
+    except IOError:
+        exist = False
+        create_config_file(conffile)
     if exist:
         return
     with config(conffile) as c:
@@ -451,86 +459,128 @@ def get_user(c, user):
     return c.users.lookup(screen_name=user)[0]
 
 def download_tweet(wq, tweetid, write=True, folder="downloaded_tweets", update=False):
-    cached = cached_tweet(tweetid, folder)
+    cached = cached_id(tweetid, folder)
     tweet = None
     if update or not cached:
         tweet = get_tweet(wq, tweetid)
-    js = json.dumps(tweet, indent=2)
+    js = json.dumps(tweet)
     if write:
         if tweet:
-            write_tweet_json(js, folder)
+            write_json(js, folder)
     else:
         print(js)
 
-def cached_tweet(tweetid, folder):
+def cached_id(oid, folder):
     tweet = None
-    file = os.path.join(folder, '%s.json' % tweetid)
+    file = os.path.join(folder, '%s.json' % oid)
     if os.path.exists(file) and os.path.isfile(file):
         try:
-            # print('%s: Tweet exists' % tweetid)
+            # print('%s: Object exists' % oid)
             with open(file) as f:
                 tweet = json.load(f)
         except Exception as ex:
-            logger.error('Error getting cached version of {}: {}'.format(tweetid, ex))
+            logger.error('Error getting cached version of {}: {}'.format(oid, ex))
     return tweet
 
-def write_tweet_json(js, folder):
-    tweetid = js['id']
-    file = tweet_file(tweetid, folder)
+def write_json(js, folder, oid=None):
+    if not oid:
+        oid = js['id']
+    file = id_file(oid, folder)
     if not os.path.exists(folder):
         os.makedirs(folder)
     with open(file, 'w') as f:
-        json.dump(js, f, indent=2)
-    logger.info('Written {} to file {}'.format(tweetid, file))
+        json.dump(js, f)
+    logger.info('Written {} to file {}'.format(oid, file))
 
-def tweet_file(tweetid, folder):
-    return os.path.join(folder, '%s.json' % tweetid)
+def id_file(oid, folder):
+    return os.path.join(folder, '%s.json' % oid)
 
-def tweet_fail_file(tweetid, folder):
+def fail_file(oid, folder):
     failsfolder = os.path.join(folder, 'failed')
     if not os.path.exists(failsfolder):
         os.makedirs(failsfolder)
-    return os.path.join(failsfolder, '%s.failed' % tweetid)
+    return os.path.join(failsfolder, '%s.failed' % oid)
 
-def tweet_failed(tweetid, folder):
-    return os.path.isfile(tweet_fail_file(tweetid, folder))
+def id_failed(oid, folder):
+    return os.path.isfile(fail_file(oid, folder))
 
-def download_tweets(wq, tweetsfile, folder, update=False, retry_failed=False, ignore_fails=True):
-    def filter_line(line):
-        tweetid = int(line)
-        # print('Checking {}'.format(tweetid))
-        if (cached_tweet(tweetid, folder) and not update) or (tweet_failed(tweetid, folder) and not retry_failed):
+def tweet_download_batch(wq, batch):
+    tweets = wq.statuses.lookup(_id=",".join(batch), map=True)['id']
+    return tweets.items()
+
+def user_download_batch(wq, batch):
+    screen_names = []
+    user_ids = []
+    for elem in batch:
+        try:
+            user_ids.append(int(elem))
+        except ValueError:
+            screen_names.append(elem)
+    print('Downloading: {} - {}'.format(user_ids, screen_names))
+    users = wq.users.lookup(user_id=",".join(map(str, user_ids)), screen_name=",".join(screen_names))
+    found_ids = []
+    found_names = []
+    for user in users:
+        uid = user['id']
+        if uid in user_ids:
+            found_ids.append(uid)
+            yield (uid, user)
+        uname = user['screen_name']
+        if uname in screen_names:
+            found_names.append(uname)
+            yield (uname, user)
+    for uid in set(user_ids) - set(found_ids):
+        yield (uid, None)
+    for name in set(screen_names) - set(found_names):
+        yield (name, None)
+
+def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fails=True,
+                  batch_method=tweet_download_batch):
+    def filter_lines(line):
+        # print('Checking {}'.format(line))
+        oid = line[0]
+        if (cached_id(oid, folder) and not update) or (id_failed(oid, folder) and not retry_failed):
             yield None
         else:
-            yield line
+            yield str(oid)
 
     def print_result(res):
-        tid, tweet = res
-        if tweet:
-            try:
-                write_tweet_json(tweet, folder=folder)
-                yield 1
-            except Exception as ex:
-                logger.error('%s: %s' % (tid, ex))
-                if not ignore_fails:
-                    raise
-        else:
-            logger.info('Tweet not recovered: {}'.format(tid))
-            with open(tweet_fail_file(tid, folder), 'w') as f:
-                print('Tweet not found', file=f)
-            yield -1
+        for oid, obj in res:
+            if obj:
+                try:
+                    write_json(obj, folder=folder, oid=oid)
+                    yield 1
+                except Exception as ex:
+                    logger.error('%s: %s' % (oid, ex))
+                    if not ignore_fails:
+                        raise
+            else:
+                logger.info('Object not recovered: {}'.format(oid))
+                with open(fail_file(oid, folder), 'w') as f:
+                    print('Object not found', file=f)
+                yield -1
 
-    def download_batch(batch):
-        tweets = wq.statuses.lookup(_id=",".join(batch), map=True)['id']
-        return tweets.items()
-
-    with open(tweetsfile) as f:
-        lines = map(lambda x: x.strip(), f)
-        lines_to_crawl = filter(lambda x: x is not None, tqdm(parallel(filter_line, lines), desc='Total lines'))
-        tweets = parallel(download_batch, lines_to_crawl, 100)
-        for res in tqdm(parallel(print_result, tweets), desc='Queried'):
-            pass
+    objects_to_crawl = filter(lambda x: x is not None, tqdm(parallel(filter_lines, lst), desc='Total objects'))
+    batch_method = partial(batch_method, wq)
+    tweets = parallel(batch_method, objects_to_crawl, 100)
+    for res in tqdm(parallel(print_result, tweets), desc='Queried'):
+        yield res
+
+def download_file(wq, csvfile, folder, column=0, delimiter=',',
+                  header=False, quotechar='"', batch_method=tweet_download_batch,
+                  **kwargs):
+    with open(csvfile) as f:
+        csvreader = csv.reader(f, delimiter=str(delimiter), quotechar=str(quotechar))
+        if header:
+            next(csvreader)
+        tweets = map(lambda row: row[column].strip(), csvreader)
+        for res in download_list(wq, tweets, folder, batch_method=batch_method,
+                                 **kwargs):
+            yield res
 
 def download_timeline(wq, user):
     return wq.statuses.user_timeline(id=user)

View File

@@ -54,7 +54,7 @@ class TestUtils(TestCase):
         toc = time.time()
         assert (tic-toc) < 600
         resp2 = utils.parallel(echo, [1,2,3,4], chunksize=2)
-        assert list(resp2) == [1,2,3,4]
+        assert list(resp2) == [1,2, 3,4]
 
 class TestUtilsEnv(TestUtils):
@@ -68,5 +68,3 @@ class TestUtilsEnv(TestUtils):
     def tearDown(self):
         if hasattr(self, 'oldenv'):
             os.environ['BITTER_CONFIG'] = self.oldenv