bitter (mirror of https://github.com/balkian/bitter.git)

Commit 5199d5b5aa (parent 6259013978): Improve CLI. Add credentials

Changed files: README.md (82 lines), bitter/cli.py, bitter/crawlers.py, bitter/utils.py (150 lines), and the test suite.
README.md

@@ -32,9 +32,89 @@ bitter api statuses/user_timeline --id thepsf --count 500
 ```
+
+## Adding credentials
+
+```
+bitter --config <YOUR CONFIGURATION FILE> credentials add
+```
+
+You can specify the parameters in the command or let the command line guide you through the process.
+
 # Examples
 
-The CLI can query the rest API:
+## Downloading a list of tweets
+
+Bitter can download tweets from a list of tweets in a CSV file.
+The result is stored as individual json files in your folder of choice.
+You can even specify the column number for tweet ids.
+Bitter will not try to download tweets that are already in that folder, unless the update flag is set.
+
+```
+Usage: bitter tweet get_all [OPTIONS] TWEETSFILE
+
+  Download tweets from a list of tweets in a CSV file. The result is stored
+  as individual json files in your folder of choice.
+
+Options:
+  -f, --folder TEXT
+  -d, --delimiter TEXT
+  -h, --header          Discard the first line (use it as a header)
+  -q, --quotechar TEXT
+  -c, --column INTEGER
+  --help                Show this message and exit.
+```
+
+For instance, this will download the tweets listed in `tweet_ids.csv` into the `tweet_info` folder:
+
+```
+bitter tweet get_all -f tweet_info tweet_ids.csv
+```
+
+## Downloading a list of users
+
+Bitter downloads users and tweets in a similar way:
+
+```
+Usage: bitter users get_all [OPTIONS] USERSFILE
+
+  Download users from a list of user ids/screen names in a CSV file. The
+  result is stored as individual json files in your folder of choice.
+
+Options:
+  -f, --folder TEXT
+  -d, --delimiter TEXT
+  -h, --header          Discard the first line (use it as a header)
+  -q, --quotechar TEXT
+  -c, --column INTEGER
+  --help                Show this message and exit.
+```
+
+The only difference is that users can be downloaded via `screen_name` or `user_id`.
+This method does not try to resolve screen names to user ids, so users may be downloaded more than once if they appear in both forms.
+
+## Downloading a stream
+
+```
+Usage: bitter stream get [OPTIONS]
+
+Options:
+  -l, --locations TEXT
+  -t, --track TEXT
+  -f, --file TEXT       File to store the stream of tweets. Default: standard output
+  -p, --politelyretry   Politely retry after a hangup/connection error
+  --help                Show this message and exit.
+```
+
+```
+bitter --config .bitter.yaml stream get
+```
+
+python -m bitter.cli --config .bitter.yaml api '/search/tweets' --result_type recent --q 'bitter OR #bitter OR @bitter' --tweet_mode extended --tweets --max_count 5000 >> mytweets.jsonlines
+
+## REST queries
+
+In newer versions of bitter, individual methods to download tweets/users using the REST API are being replaced with a generic method to call the API.
+
 ```
 bitter api <URL endpoint> --parameter VALUE ... | [--tweets | --users] [--max_count MAX_COUNT] [--count COUNT_PER_CALL]
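Note that the README gives a concrete invocation only for tweets. An equivalent call for users would follow the same shape (the file and folder names here are illustrative, not taken from the commit):

```
bitter users get_all -f user_info user_ids.csv
```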
(version file)

@@ -1 +1 @@
-0.8.0
+0.9.0
bitter/cli.py

@@ -41,6 +41,32 @@ def main(ctx, verbose, logging_level, config, credentials):
     if os.path.exists(utils.get_config_path(credentials)):
         utils.copy_credentials_to_config(credentials, config)
 
 
+@main.group()
+@click.pass_context
+def credentials(ctx):
+    pass
+
+
+@credentials.command('add')
+@click.option('--consumer_key', default=None)
+@click.option('--consumer_secret', default=None)
+@click.option('--token_key', default=None)
+@click.option('--token_secret', default=None)
+@click.argument('user_name')
+def add(user_name, consumer_key, consumer_secret, token_key, token_secret):
+    if not consumer_key:
+        consumer_key = click.prompt('Please, enter your CONSUMER KEY')
+    if not consumer_secret:
+        consumer_secret = click.prompt('Please, enter your CONSUMER SECRET')
+    if not token_key:
+        token_key = click.prompt('Please, enter your ACCESS TOKEN')
+    if not token_secret:
+        token_secret = click.prompt('Please, enter your ACCESS TOKEN SECRET')
+    utils.add_credentials(conffile=bconf.CONFIG_FILE, user=user_name,
+                          consumer_key=consumer_key, consumer_secret=consumer_secret,
+                          token_key=token_key, token_secret=token_secret)
+    click.echo('Credentials added for {}'.format(user_name))
+
+
 @main.group()
 @click.pass_context
 def tweet(ctx):
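The new `credentials` group turns credential setup into a guided process. A possible interactive session (the user name and key values are placeholders; the prompt texts come from the code above):

```
$ bitter --config .bitter.yaml credentials add myuser
Please, enter your CONSUMER KEY: xxxxxxxx
Please, enter your CONSUMER SECRET: xxxxxxxx
Please, enter your ACCESS TOKEN: xxxxxxxx
Please, enter your ACCESS TOKEN SECRET: xxxxxxxx
Credentials added for myuser
```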
@@ -52,22 +78,36 @@ def tweet(ctx):
 @click.option('-u', '--update', help="Update the file even if the tweet exists", is_flag=True, default=False)
 @click.argument('tweetid')
 def get_tweet(tweetid, write, folder, update):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     utils.download_tweet(wq, tweetid, write, folder, update)
 
-@tweet.command('get_all')
+@tweet.command('get_all', help='''Download tweets from a list of tweets in a CSV file.
+
+The result is stored as individual json files in your folder of choice.''')
 @click.argument('tweetsfile', 'File with a list of tweets to look up')
 @click.option('-f', '--folder', default="tweets")
+@click.option('-u', '--update', is_flag=True, default=False, help='Download tweets even if they are already present. WARNING: it will overwrite existing files!')
+@click.option('-d', '--delimiter', default=",")
+@click.option('-h', '--header', help='Discard the first line (use it as a header)',
+              is_flag=True, default=False)
+@click.option('-q', '--quotechar', default='"')
+@click.option('-c', '--column', type=int, default=0)
 @click.pass_context
-def get_tweets(ctx, tweetsfile, folder):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
-    utils.download_tweets(wq, tweetsfile, folder)
+def get_tweets(ctx, tweetsfile, folder, update, delimiter, header, quotechar, column):
+    if update and not click.confirm('This may overwrite existing tweets. Continue?'):
+        click.echo('Cancelling')
+        return
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
+    for i in utils.download_file(wq, tweetsfile, folder, delimiter=delimiter,
+                                 batch_method=utils.tweet_download_batch,
+                                 header=header, quotechar=quotechar,
+                                 column=column, update=update):
+        pass
 
 @tweet.command('search')
 @click.argument('query')
 @click.pass_context
 def search(ctx, query):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     t = utils.search_tweet(wq, query)
     print(json.dumps(t, indent=2))
 
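The new options let `get_all` read ids out of arbitrary CSV layouts. For instance, a hypothetical file with a header row, semicolon delimiters, and tweet ids in the third column could be processed with:

```
bitter tweet get_all -f tweet_info -d ';' -h -c 2 tweet_ids.csv
```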
@@ -75,7 +115,7 @@ def search(ctx, query):
 @click.argument('user')
 @click.pass_context
 def timeline(ctx, user):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     t = utils.user_timeline(wq, user)
     print(json.dumps(t, indent=2))
 
@@ -101,7 +141,7 @@ def list_users(ctx, db):
 @click.option('-f', '--folder', default="users")
 @click.option('-u', '--update', help="Update the file even if the user exists", is_flag=True, default=False)
 def get_user(user, write, folder, update):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     if not write:
         u = utils.get_user(wq, user)
         js = json.dumps(u, indent=2)
@@ -118,15 +158,28 @@ def get_user(user, write, folder, update):
         js = json.dumps(u, indent=2)
         print(js, file=f)
 
-@users.command('get_all')
+@users.command('get_all', help='''Download users from a list of user ids/screen names in a CSV file.
+
+The result is stored as individual json files in your folder of choice.''')
 @click.argument('usersfile', 'File with a list of users to look up')
 @click.option('-f', '--folder', default="users")
+@click.option('-u', '--update', is_flag=True, default=False, help='Download user even if it is already present. WARNING: it will overwrite existing files!')
+@click.option('-d', '--delimiter', default=",")
+@click.option('-h', '--header', help='Discard the first line (use it as a header)',
+              is_flag=True, default=False)
+@click.option('-q', '--quotechar', default='"')
+@click.option('-c', '--column', type=int, default=0)
 @click.pass_context
-def get_users(ctx, usersfile, folder):
-    with open(usersfile) as f:
-        for line in f:
-            uid = line.strip()
-            ctx.invoke(get_user, folder=folder, user=uid, write=True)
+def get_users(ctx, usersfile, folder, update, delimiter, header, quotechar, column):
+    if update and not click.confirm('This may overwrite existing users. Continue?'):
+        click.echo('Cancelling')
+        return
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
+    for i in utils.download_file(wq, usersfile, folder, delimiter=delimiter,
+                                 batch_method=utils.user_download_batch,
+                                 update=update,
+                                 header=header, quotechar=quotechar,
+                                 column=column):
+        pass
 
 @users.command('crawl')
 @click.option('--db', required=True, help='Database to save all users.')
@@ -147,7 +200,7 @@ def crawl_users(ctx, usersfile, skip, until, threads, db):
         return ExitStack()
 
 
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     logger.info('Starting Network crawler with {} threads and {} credentials.'.format(threads,
                                                                                       len(wq.queue)))
 
@@ -311,7 +364,7 @@ def users_extractor(ctx):
 @click.pass_context
 def extract(ctx, recursive, user, name, initfile):
     print(locals())
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     dburi = ctx.obj['DBURI']
     utils.extract(wq,
                   recursive=recursive,
@@ -323,7 +376,7 @@ def extract(ctx, recursive, user, name, initfile):
 @extractor.command('reset')
 @click.pass_context
 def reset_extractor(ctx):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     db = ctx.obj['DBURI']
     session = make_session(db)
     session.query(ExtractorEntry).filter(ExtractorEntry.pending==True).update({'pending':False})
@@ -332,7 +385,7 @@ def reset_extractor(ctx):
 @click.argument('url', required=False)
 @click.pass_context
 def get_limits(ctx, url):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     total = {}
     for worker in wq.queue:
         resp = worker.client.application.rate_limit_status()
@@ -357,7 +410,8 @@ def get_limits(ctx, url):
 
 
 
-@main.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=False))
+@main.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=False),
+              help='''Issue a call to an endpoint of the Twitter API.''')
 @click.argument('cmd', nargs=1)
 @click.option('--tweets', is_flag=True, help='Fetch more tweets using smart pagination. Use --count to control how many tweets to fetch per call, and --max_count to set the number of desired tweets (or -1 to get as many as possible).', type=bool, default=False)
 @click.option('--users', is_flag=True, help='Fetch more users using smart pagination. Use --count to control how many users to fetch per call, and --max_count to set the number of desired users (or -1 to get as many as possible).', type=bool, default=False)
@@ -374,7 +428,7 @@ def api(ctx, cmd, tweets, users, api_args):
         if k in mappings:
             k = mappings[k]
         opts[k] = v
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     if tweets:
         resp = utils.consume_tweets(wq[cmd], **opts)
     elif users:
@@ -409,7 +463,7 @@ def stream(ctx):
 @click.option('-p', '--politelyretry', help='Politely retry after a hangup/connection error', is_flag=True, default=True)
 @click.pass_context
 def get_stream(ctx, locations, track, file, politelyretry):
-    wq = crawlers.StreamQueue.from_config(bconf.CONFIG_FILE, 1)
+    wq = crawlers.StreamQueue.from_config(conffile=bconf.CONFIG_FILE, max_workers=1)
 
     query_args = {}
     if locations:
bitter/crawlers.py

@@ -61,11 +61,13 @@ class FromCredentialsMixin(object):
 class FromConfigMixin(object):
 
     @classmethod
-    def from_config(cls, conffile=None, max_workers=None):
+    def from_config(cls, config=None, conffile=None, max_workers=None):
         wq = cls()
 
-        with utils.config(conffile) as c:
-            for cred in islice(c['credentials'], max_workers):
-                wq.ready(cls.worker_class(cred["user"], cred))
+        if not config:
+            with utils.config(conffile) as c:
+                config = c
+        for cred in islice(config['credentials'], max_workers):
+            wq.ready(cls.worker_class(cred["user"], cred))
         return wq
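With the new `config` parameter, a queue can now be built from an in-memory dictionary instead of a file. A minimal sketch, assuming the credential key names used by the CLI prompts (all values are placeholders):

```python
from bitter import crawlers

# Hypothetical in-memory configuration; normally this would come from the
# YAML config file or the BITTER_CONFIG environment variable.
config = {
    'credentials': [
        {'user': 'myuser',
         'consumer_key': 'XXX', 'consumer_secret': 'XXX',
         'token_key': 'XXX', 'token_secret': 'XXX'},
    ]
}

# No conffile needed: credentials are read straight from the dict.
wq = crawlers.TwitterQueue.from_config(config=config)
```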
bitter/utils.py
@@ -4,6 +4,7 @@ import logging
 import time
 import json
 import yaml
+import csv
 import io
 
 import signal
@@ -93,7 +94,7 @@ def read_config(conffile):
     p = conffile and get_config_path(conffile)
     if p:
         if not os.path.exists(p):
-            raise Exception('{} file does not exist.'.format(p))
+            raise IOError('{} file does not exist.'.format(p))
         f = open(p, 'r')
     elif 'BITTER_CONFIG' not in os.environ:
         raise Exception('No config file or BITTER_CONFIG env variable.')
@@ -103,6 +104,8 @@ def read_config(conffile):
 
 
 def write_config(conf, conffile=None):
+    if not conf:
+        conf = {'credentials': []}
     if conffile:
         p = get_config_path(conffile)
         with open(p, 'w') as f:
@@ -122,6 +125,7 @@ def create_config_file(conffile=None):
     conffile = get_config_path(conffile)
     with open(conffile, 'a'):
         pass
+    write_config(None, conffile)
 
 
 def get_credentials(conffile=None, inverse=False, **kwargs):
@@ -142,7 +146,11 @@ def delete_credentials(conffile=None, **creds):
 
 
 def add_credentials(conffile=None, **creds):
-    exist = get_credentials(conffile, **creds)
+    try:
+        exist = get_credentials(conffile, **creds)
+    except IOError:
+        exist = False
+        create_config_file(conffile)
     if exist:
         return
     with config(conffile) as c:
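Taken together, these changes let `add_credentials` bootstrap a missing config file instead of raising. The config is YAML with a single `credentials` list; a sketch of the expected layout (key names inferred from `write_config` and the CLI prompts, values are placeholders):

```yaml
credentials:
  - user: myuser
    consumer_key: XXX
    consumer_secret: XXX
    token_key: XXX
    token_secret: XXX
```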
@@ -451,86 +459,128 @@ def get_user(c, user):
     return c.users.lookup(screen_name=user)[0]
 
 def download_tweet(wq, tweetid, write=True, folder="downloaded_tweets", update=False):
-    cached = cached_tweet(tweetid, folder)
+    cached = cached_id(tweetid, folder)
     tweet = None
     if update or not cached:
         tweet = get_tweet(wq, tweetid)
-    js = json.dumps(tweet, indent=2)
+    js = json.dumps(tweet)
     if write:
         if tweet:
-            write_tweet_json(js, folder)
+            write_json(js, folder)
     else:
         print(js)
 
 
-def cached_tweet(tweetid, folder):
+def cached_id(oid, folder):
     tweet = None
-    file = os.path.join(folder, '%s.json' % tweetid)
+    file = os.path.join(folder, '%s.json' % oid)
     if os.path.exists(file) and os.path.isfile(file):
         try:
-            # print('%s: Tweet exists' % tweetid)
+            # print('%s: Object exists' % oid)
             with open(file) as f:
                 tweet = json.load(f)
         except Exception as ex:
-            logger.error('Error getting cached version of {}: {}'.format(tweetid, ex))
+            logger.error('Error getting cached version of {}: {}'.format(oid, ex))
     return tweet
 
-def write_tweet_json(js, folder):
-    tweetid = js['id']
-    file = tweet_file(tweetid, folder)
+def write_json(js, folder, oid=None):
+    if not oid:
+        oid = js['id']
+    file = id_file(oid, folder)
     if not os.path.exists(folder):
         os.makedirs(folder)
     with open(file, 'w') as f:
-        json.dump(js, f, indent=2)
-    logger.info('Written {} to file {}'.format(tweetid, file))
+        json.dump(js, f)
+    logger.info('Written {} to file {}'.format(oid, file))
 
-def tweet_file(tweetid, folder):
-    return os.path.join(folder, '%s.json' % tweetid)
+def id_file(oid, folder):
+    return os.path.join(folder, '%s.json' % oid)
 
-def tweet_fail_file(tweetid, folder):
+def fail_file(oid, folder):
     failsfolder = os.path.join(folder, 'failed')
     if not os.path.exists(failsfolder):
         os.makedirs(failsfolder)
-    return os.path.join(failsfolder, '%s.failed' % tweetid)
+    return os.path.join(failsfolder, '%s.failed' % oid)
 
-def tweet_failed(tweetid, folder):
-    return os.path.isfile(tweet_fail_file(tweetid, folder))
+def id_failed(oid, folder):
+    return os.path.isfile(fail_file(oid, folder))
 
-def download_tweets(wq, tweetsfile, folder, update=False, retry_failed=False, ignore_fails=True):
-    def filter_line(line):
-        tweetid = int(line)
-        # print('Checking {}'.format(tweetid))
-        if (cached_tweet(tweetid, folder) and not update) or (tweet_failed(tweetid, folder) and not retry_failed):
-            yield None
-        else:
-            yield line
-
-    def print_result(res):
-        tid, tweet = res
-        if tweet:
-            try:
-                write_tweet_json(tweet, folder=folder)
-                yield 1
-            except Exception as ex:
-                logger.error('%s: %s' % (tid, ex))
-                if not ignore_fails:
-                    raise
-        else:
-            logger.info('Tweet not recovered: {}'.format(tid))
-            with open(tweet_fail_file(tid, folder), 'w') as f:
-                print('Tweet not found', file=f)
-            yield -1
-
-    def download_batch(batch):
-        tweets = wq.statuses.lookup(_id=",".join(batch), map=True)['id']
-        return tweets.items()
-
-    with open(tweetsfile) as f:
-        lines = map(lambda x: x.strip(), f)
-        lines_to_crawl = filter(lambda x: x is not None, tqdm(parallel(filter_line, lines), desc='Total lines'))
-        tweets = parallel(download_batch, lines_to_crawl, 100)
+def tweet_download_batch(wq, batch):
+    tweets = wq.statuses.lookup(_id=",".join(batch), map=True)['id']
+    return tweets.items()
+
+
+def user_download_batch(wq, batch):
+    screen_names = []
+    user_ids = []
+    for elem in batch:
+        try:
+            user_ids.append(int(elem))
+        except ValueError:
+            screen_names.append(elem)
+    print('Downloading: {} - {}'.format(user_ids, screen_names))
+    users = wq.users.lookup(user_id=",".join(str(uid) for uid in user_ids),
+                            screen_name=",".join(screen_names))
+    found_ids = []
+    found_names = []
+    for user in users:
+        uid = user['id']
+        if uid in user_ids:
+            found_ids.append(uid)
+            yield (uid, user)
+        uname = user['screen_name']
+        if uname in screen_names:
+            found_names.append(uname)
+            yield (uname, user)
+    for uid in set(user_ids) - set(found_ids):
+        yield (uid, None)
+    for name in set(screen_names) - set(found_names):
+        yield (name, None)
+
+
+def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fails=True,
+                  batch_method=tweet_download_batch):
+    def filter_lines(line):
+        # print('Checking {}'.format(line))
+        oid = line
+        if (cached_id(oid, folder) and not update) or (id_failed(oid, folder) and not retry_failed):
+            yield None
+        else:
+            yield str(oid)
+
+    def print_result(res):
+        for oid, obj in res:
+            if obj:
+                try:
+                    write_json(obj, folder=folder, oid=oid)
+                    yield 1
+                except Exception as ex:
+                    logger.error('%s: %s' % (oid, ex))
+                    if not ignore_fails:
+                        raise
+            else:
+                logger.info('Object not recovered: {}'.format(oid))
+                with open(fail_file(oid, folder), 'w') as f:
+                    print('Object not found', file=f)
+                yield -1
+
+    objects_to_crawl = filter(lambda x: x is not None, tqdm(parallel(filter_lines, lst), desc='Total objects'))
+    batch_method = partial(batch_method, wq)
+    tweets = parallel(batch_method, objects_to_crawl, 100)
     for res in tqdm(parallel(print_result, tweets), desc='Queried'):
-        pass
+        yield res
+
+
+def download_file(wq, csvfile, folder, column=0, delimiter=',',
+                  header=False, quotechar='"', batch_method=tweet_download_batch,
+                  **kwargs):
+    with open(csvfile) as f:
+        csvreader = csv.reader(f, delimiter=str(delimiter), quotechar=str(quotechar))
+        if header:
+            next(csvreader)
+        tweets = map(lambda row: row[column].strip(), csvreader)
+        for res in download_list(wq, tweets, folder, batch_method=batch_method,
+                                 **kwargs):
+            yield res
+
+
 def download_timeline(wq, user):
     return wq.statuses.user_timeline(id=user)
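The `get_all` CLI commands above are thin wrappers around this generator. A minimal sketch of driving it directly, assuming a valid config file and an existing `tweet_ids.csv` (the file and folder names are illustrative):

```python
from bitter import crawlers, utils

# Build a queue of authenticated API workers from the config file.
wq = crawlers.TwitterQueue.from_config(conffile='.bitter.yaml')

# download_file is a generator: it yields 1 per object written and -1 per
# failed lookup, so it must be consumed for the downloads to happen.
for _ in utils.download_file(wq, 'tweet_ids.csv', 'tweet_info',
                             batch_method=utils.tweet_download_batch,
                             header=True, column=0):
    pass
```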
tests/test_utils.py

@@ -54,7 +54,7 @@ class TestUtils(TestCase):
         toc = time.time()
         assert (tic-toc) < 600
         resp2 = utils.parallel(echo, [1,2,3,4], chunksize=2)
-        assert list(resp2) == [1,2,3,4]
+        assert list(resp2) == [1,2, 3,4]
 
 
 class TestUtilsEnv(TestUtils):
@@ -68,5 +68,3 @@ class TestUtilsEnv(TestUtils):
     def tearDown(self):
         if hasattr(self, 'oldenv'):
             os.environ['BITTER_CONFIG'] = self.oldenv
-
-