mirror of https://github.com/balkian/bitter.git
synced 2024-12-22 00:18:12 +00:00

Improve CLI. Add credentials

This commit is contained in:
parent 6259013978
commit 5199d5b5aa

README.md
@@ -32,9 +32,89 @@ bitter api statuses/user_timeline --id thepsf --count 500
```

## Adding credentials

```
bitter --config <YOUR CONFIGURATION FILE> credentials add
```

You can specify the parameters in the command or let the command line guide you through the process.
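
For example, everything can be passed on the command line (placeholder values shown; the option names come from the `credentials add` command introduced in this commit):

```
bitter --config .bitter.yaml credentials add --consumer_key XXX --consumer_secret XXX --token_key XXX --token_secret XXX myuser
```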

# Examples

The CLI can query the REST API:

## Downloading a list of tweets

Bitter can download tweets from a list of tweet ids in a CSV file.
The result is stored as individual json files in your folder of choice.
You can even specify the column number for tweet ids.
Bitter will not try to download tweets that are already present in that folder, unless you pass the `--update` flag.

```
Usage: bitter tweet get_all [OPTIONS] TWEETSFILE

  Download tweets from a list of tweets in a CSV file. The result is stored
  as individual json files in your folder of choice.

Options:
  -f, --folder TEXT
  -d, --delimiter TEXT
  -h, --header          Discard the first line (use it as a header)
  -q, --quotechar TEXT
  -c, --column INTEGER
  --help                Show this message and exit.
```

For instance, this will download the tweets listed in `tweet_ids.csv` into the `tweet_info` folder:

```
bitter tweet get_all -f tweet_info tweet_ids.csv
```
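
The parsing flags can be combined for other layouts. A hypothetical example: a semicolon-delimited file with a header row and tweet ids in the second column (index 1):

```
bitter tweet get_all -f tweet_info -d ';' -h -c 1 tweet_ids.csv
```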

## Downloading a list of users

Bitter downloads users and tweets in a similar way:

```
Usage: bitter users get_all [OPTIONS] USERSFILE

  Download users from a list of user ids/screen names in a CSV file. The
  result is stored as individual json files in your folder of choice.

Options:
  -f, --folder TEXT
  -d, --delimiter TEXT
  -h, --header          Discard the first line (use it as a header)
  -q, --quotechar TEXT
  -c, --column INTEGER
  --help                Show this message and exit.
```

The only difference is that users can be downloaded either by `screen_name` or by `user_id`.
This method does not try to resolve screen names to user ids, so a user may be downloaded more than once if it appears in both forms.
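
For example, assuming a hypothetical `users.csv` that lists one user per line, mixing numeric ids and screen names:

```
bitter users get_all -f user_info users.csv
```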

## Downloading a stream

```
Usage: bitter stream get [OPTIONS]

Options:
  -l, --locations TEXT
  -t, --track TEXT
  -f, --file TEXT       File to store the stream of tweets. Default: standard output
  -p, --politelyretry   Politely retry after a hangup/connection error
  --help                Show this message and exit.
```

```
bitter --config .bitter.yaml stream get
```
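
The same command accepts filters. A hypothetical invocation that tracks a keyword and stores the tweets in a file instead of standard output:

```
bitter --config .bitter.yaml stream get -t bitter -f bitter_tweets.jsonlines
```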

The REST API can also be queried directly. For instance, this search for recent tweets appends the results to a file:

```
python -m bitter.cli --config .bitter.yaml api '/search/tweets' --result_type recent --q 'bitter OR #bitter OR @bitter' --tweet_mode extended --tweets --max_count 5000 >> mytweets.jsonlines
```

## REST queries

In newer versions of bitter, individual methods to download tweets/users using the REST API are being replaced with a generic method to call the API:

```
bitter api <URL endpoint> --parameter VALUE ... [--tweets | --users] [--max_count MAX_COUNT] [--count COUNT_PER_CALL]
```
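
For instance, the timeline example at the top of this README can be combined with smart pagination (the `--tweets`, `--count` and `--max_count` options added in this commit) to fetch up to 1000 tweets, 200 per call:

```
bitter api statuses/user_timeline --id thepsf --count 200 --tweets --max_count 1000
```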

bitter/VERSION
@@ -1 +1 @@
-0.8.0
+0.9.0

bitter/cli.py
@@ -41,6 +41,32 @@ def main(ctx, verbose, logging_level, config, credentials):
     if os.path.exists(utils.get_config_path(credentials)):
         utils.copy_credentials_to_config(credentials, config)


+@main.group()
+@click.pass_context
+def credentials(ctx):
+    pass
+
+
+@credentials.command('add')
+@click.option('--consumer_key', default=None)
+@click.option('--consumer_secret', default=None)
+@click.option('--token_key', default=None)
+@click.option('--token_secret', default=None)
+@click.argument('user_name')
+def add(user_name, consumer_key, consumer_secret, token_key, token_secret):
+    if not consumer_key:
+        consumer_key = click.prompt('Please, enter your CONSUMER KEY')
+    if not consumer_secret:
+        consumer_secret = click.prompt('Please, enter your CONSUMER SECRET')
+    if not token_key:
+        token_key = click.prompt('Please, enter your ACCESS TOKEN')
+    if not token_secret:
+        token_secret = click.prompt('Please, enter your ACCESS TOKEN SECRET')
+    utils.add_credentials(conffile=bconf.CONFIG_FILE, user=user_name, consumer_key=consumer_key, consumer_secret=consumer_secret,
+                          token_key=token_key, token_secret=token_secret)
+    click.echo('Credentials added for {}'.format(user_name))
+
+
 @main.group()
 @click.pass_context
 def tweet(ctx):
@@ -52,22 +78,36 @@ def tweet(ctx):
 @click.option('-u', '--update', help="Update the file even if the tweet exists", is_flag=True, default=False)
 @click.argument('tweetid')
 def get_tweet(tweetid, write, folder, update):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     utils.download_tweet(wq, tweetid, write, folder, update)

-@tweet.command('get_all')
+@tweet.command('get_all', help='''Download tweets from a list of tweets in a CSV file.
+The result is stored as individual json files in your folder of choice.''')
+@click.argument('tweetsfile', 'File with a list of tweets to look up')
 @click.option('-f', '--folder', default="tweets")
+@click.option('-u', '--update', is_flag=True, default=False, help='Download tweet even if it is already present. WARNING: it will overwrite existing files!')
+@click.option('-d', '--delimiter', default=",")
+@click.option('-h', '--header', help='Discard the first line (use it as a header)',
+              is_flag=True, default=False)
+@click.option('-q', '--quotechar', default='"')
+@click.option('-c', '--column', type=int, default=0)
 @click.pass_context
-def get_tweets(ctx, tweetsfile, folder):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
-    utils.download_tweets(wq, tweetsfile, folder)
+def get_tweets(ctx, tweetsfile, folder, update, delimiter, header, quotechar, column):
+    if update and not click.confirm('This may overwrite existing tweets. Continue?'):
+        click.echo('Cancelling')
+        return
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
+    for i in utils.download_file(wq, tweetsfile, folder, delimiter=delimiter,
+                                 batch_method=utils.tweet_download_batch,
+                                 header=header, quotechar=quotechar,
+                                 column=column, update=update):
+        pass

 @tweet.command('search')
 @click.argument('query')
 @click.pass_context
 def search(ctx, query):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     t = utils.search_tweet(wq, query)
     print(json.dumps(t, indent=2))
@@ -75,7 +115,7 @@ def search(ctx, query):
 @click.argument('user')
 @click.pass_context
 def timeline(ctx, user):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     t = utils.user_timeline(wq, user)
     print(json.dumps(t, indent=2))
@@ -101,7 +141,7 @@ def list_users(ctx, db):
 @click.option('-f', '--folder', default="users")
 @click.option('-u', '--update', help="Update the file even if the user exists", is_flag=True, default=False)
 def get_user(user, write, folder, update):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     if not write:
         u = utils.get_user(wq, user)
         js = json.dumps(u, indent=2)
@@ -118,15 +158,28 @@ def get_user(user, write, folder, update):
             js = json.dumps(u, indent=2)
             print(js, file=f)

-@users.command('get_all')
+@users.command('get_all', help='''Download users from a list of user ids/screen names in a CSV file.
+The result is stored as individual json files in your folder of choice.''')
+@click.argument('usersfile', 'File with a list of users to look up')
 @click.option('-f', '--folder', default="users")
+@click.option('-u', '--update', is_flag=True, default=False, help='Download user even if it is already present. WARNING: it will overwrite existing files!')
+@click.option('-d', '--delimiter', default=",")
+@click.option('-h', '--header', help='Discard the first line (use it as a header)',
+              is_flag=True, default=False)
+@click.option('-q', '--quotechar', default='"')
+@click.option('-c', '--column', type=int, default=0)
 @click.pass_context
-def get_users(ctx, usersfile, folder):
-    with open(usersfile) as f:
-        for line in f:
-            uid = line.strip()
-            ctx.invoke(get_user, folder=folder, user=uid, write=True)
+def get_users(ctx, usersfile, folder, update, delimiter, header, quotechar, column):
+    if update and not click.confirm('This may overwrite existing users. Continue?'):
+        click.echo('Cancelling')
+        return
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
+    for i in utils.download_file(wq, usersfile, folder, delimiter=delimiter,
+                                 batch_method=utils.user_download_batch,
+                                 update=update,
+                                 header=header, quotechar=quotechar,
+                                 column=column):
+        pass

 @users.command('crawl')
 @click.option('--db', required=True, help='Database to save all users.')
@@ -147,7 +200,7 @@ def crawl_users(ctx, usersfile, skip, until, threads, db):
         return ExitStack()


-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     logger.info('Starting Network crawler with {} threads and {} credentials.'.format(threads,
                                                                                       len(wq.queue)))
@@ -311,7 +364,7 @@ def users_extractor(ctx):
 @click.pass_context
 def extract(ctx, recursive, user, name, initfile):
     print(locals())
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     dburi = ctx.obj['DBURI']
     utils.extract(wq,
                   recursive=recursive,
@@ -323,7 +376,7 @@ def extract(ctx, recursive, user, name, initfile):
 @extractor.command('reset')
 @click.pass_context
 def reset_extractor(ctx):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     db = ctx.obj['DBURI']
     session = make_session(db)
     session.query(ExtractorEntry).filter(ExtractorEntry.pending==True).update({'pending':False})
@@ -332,7 +385,7 @@ def reset_extractor(ctx):
 @click.argument('url', required=False)
 @click.pass_context
 def get_limits(ctx, url):
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     total = {}
     for worker in wq.queue:
         resp = worker.client.application.rate_limit_status()
@@ -357,7 +410,8 @@ def get_limits(ctx, url):



-@main.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=False))
+@main.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=False),
+              help='''Issue a call to an endpoint of the Twitter API.''')
 @click.argument('cmd', nargs=1)
 @click.option('--tweets', is_flag=True, help='Fetch more tweets using smart pagination. Use --count to control how many tweets to fetch per call, and --max_count to set the number of desired tweets (or -1 to get as many as possible).', type=bool, default=False)
 @click.option('--users', is_flag=True, help='Fetch more users using smart pagination. Use --count to control how many users to fetch per call, and --max_count to set the number of desired users (or -1 to get as many as possible).', type=bool, default=False)
@@ -374,7 +428,7 @@ def api(ctx, cmd, tweets, users, api_args):
         if k in mappings:
             k = mappings[k]
         opts[k] = v
-    wq = crawlers.TwitterQueue.from_config(bconf.CONFIG_FILE)
+    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     if tweets:
         resp = utils.consume_tweets(wq[cmd], **opts)
     elif users:
@@ -409,7 +463,7 @@ def stream(ctx):
 @click.option('-p', '--politelyretry', help='Politely retry after a hangup/connection error', is_flag=True, default=True)
 @click.pass_context
 def get_stream(ctx, locations, track, file, politelyretry):
-    wq = crawlers.StreamQueue.from_config(bconf.CONFIG_FILE, 1)
+    wq = crawlers.StreamQueue.from_config(conffile=bconf.CONFIG_FILE, max_workers=1)

     query_args = {}
     if locations:

bitter/crawlers.py
@@ -61,12 +61,14 @@ class FromCredentialsMixin(object):
 class FromConfigMixin(object):

     @classmethod
-    def from_config(cls, conffile=None, max_workers=None):
+    def from_config(cls, config=None, conffile=None, max_workers=None):
         wq = cls()

-        with utils.config(conffile) as c:
-            for cred in islice(c['credentials'], max_workers):
-                wq.ready(cls.worker_class(cred["user"], cred))
+        if not config:
+            with utils.config(conffile) as c:
+                config = c
+        for cred in islice(config['credentials'], max_workers):
+            wq.ready(cls.worker_class(cred["user"], cred))
         return wq

 class TwitterWorker(object):
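
With this change, `from_config` can also be fed an already-loaded configuration instead of a file. A minimal sketch, assuming a dict with the same shape as the YAML config (a `credentials` list whose entries carry a `user` key plus the OAuth fields used by `credentials add`):

```python
from bitter import crawlers

# Hypothetical in-memory config; normally this comes from the YAML file.
conf = {'credentials': [{'user': 'myuser',
                         'consumer_key': 'XXX',
                         'consumer_secret': 'XXX',
                         'token_key': 'XXX',
                         'token_secret': 'XXX'}]}

# Build a worker queue without touching the filesystem.
wq = crawlers.TwitterQueue.from_config(config=conf)
```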

bitter/utils.py
@@ -4,6 +4,7 @@ import logging
 import time
 import json
 import yaml
+import csv
 import io

 import signal
@@ -93,7 +94,7 @@ def read_config(conffile):
     p = conffile and get_config_path(conffile)
     if p:
         if not os.path.exists(p):
-            raise Exception('{} file does not exist.'.format(p))
+            raise IOError('{} file does not exist.'.format(p))
         f = open(p, 'r')
     elif 'BITTER_CONFIG' not in os.environ:
         raise Exception('No config file or BITTER_CONFIG env variable.')
@@ -103,6 +104,8 @@ def read_config(conffile):


 def write_config(conf, conffile=None):
+    if not conf:
+        conf = {'credentials': []}
     if conffile:
         p = get_config_path(conffile)
         with open(p, 'w') as f:
@@ -122,6 +125,7 @@ def create_config_file(conffile=None):
     conffile = get_config_path(conffile)
     with open(conffile, 'a'):
         pass
+    write_config(None, conffile)


 def get_credentials(conffile=None, inverse=False, **kwargs):
@@ -142,7 +146,11 @@ def delete_credentials(conffile=None, **creds):


 def add_credentials(conffile=None, **creds):
-    exist = get_credentials(conffile, **creds)
+    try:
+        exist = get_credentials(conffile, **creds)
+    except IOError:
+        exist = False
+        create_config_file(conffile)
     if exist:
         return
     with config(conffile) as c:
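
The CLI's `credentials add` command above ends up calling this helper; a programmatic sketch of the same operation (placeholder values):

```python
from bitter import utils

# Equivalent of `bitter --config .bitter.yaml credentials add ... myuser`.
# With this commit, the config file is created first if it does not exist.
utils.add_credentials(conffile='.bitter.yaml', user='myuser',
                      consumer_key='XXX', consumer_secret='XXX',
                      token_key='XXX', token_secret='XXX')
```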
@@ -451,86 +459,128 @@ def get_user(c, user):
     return c.users.lookup(screen_name=user)[0]

 def download_tweet(wq, tweetid, write=True, folder="downloaded_tweets", update=False):
-    cached = cached_tweet(tweetid, folder)
+    cached = cached_id(tweetid, folder)
     tweet = None
     if update or not cached:
         tweet = get_tweet(wq, tweetid)
-    js = json.dumps(tweet, indent=2)
+    js = json.dumps(tweet)
     if write:
         if tweet:
-            write_tweet_json(js, folder)
+            write_json(js, folder)
     else:
         print(js)


-def cached_tweet(tweetid, folder):
+def cached_id(oid, folder):
     tweet = None
-    file = os.path.join(folder, '%s.json' % tweetid)
+    file = os.path.join(folder, '%s.json' % oid)
     if os.path.exists(file) and os.path.isfile(file):
         try:
-            # print('%s: Tweet exists' % tweetid)
+            # print('%s: Object exists' % oid)
             with open(file) as f:
                 tweet = json.load(f)
         except Exception as ex:
-            logger.error('Error getting cached version of {}: {}'.format(tweetid, ex))
+            logger.error('Error getting cached version of {}: {}'.format(oid, ex))
     return tweet

-def write_tweet_json(js, folder):
-    tweetid = js['id']
-    file = tweet_file(tweetid, folder)
+def write_json(js, folder, oid=None):
+    if not oid:
+        oid = js['id']
+    file = id_file(oid, folder)
     if not os.path.exists(folder):
         os.makedirs(folder)
     with open(file, 'w') as f:
-        json.dump(js, f, indent=2)
-    logger.info('Written {} to file {}'.format(tweetid, file))
+        json.dump(js, f)
+    logger.info('Written {} to file {}'.format(oid, file))

-def tweet_file(tweetid, folder):
-    return os.path.join(folder, '%s.json' % tweetid)
+def id_file(oid, folder):
+    return os.path.join(folder, '%s.json' % oid)

-def tweet_fail_file(tweetid, folder):
+def fail_file(oid, folder):
     failsfolder = os.path.join(folder, 'failed')
     if not os.path.exists(failsfolder):
         os.makedirs(failsfolder)
-    return os.path.join(failsfolder, '%s.failed' % tweetid)
+    return os.path.join(failsfolder, '%s.failed' % oid)

-def tweet_failed(tweetid, folder):
-    return os.path.isfile(tweet_fail_file(tweetid, folder))
+def id_failed(oid, folder):
+    return os.path.isfile(fail_file(oid, folder))

-def download_tweets(wq, tweetsfile, folder, update=False, retry_failed=False, ignore_fails=True):
-    def filter_line(line):
-        tweetid = int(line)
-        # print('Checking {}'.format(tweetid))
-        if (cached_tweet(tweetid, folder) and not update) or (tweet_failed(tweetid, folder) and not retry_failed):
+def tweet_download_batch(wq, batch):
+    tweets = wq.statuses.lookup(_id=",".join(batch), map=True)['id']
+    return tweets.items()
+
+def user_download_batch(wq, batch):
+    screen_names = []
+    user_ids = []
+    for elem in batch:
+        try:
+            user_ids.append(int(elem))
+        except ValueError:
+            screen_names.append(elem)
+    print('Downloading: {} - {}'.format(user_ids, screen_names))
+    users = wq.users.lookup(user_id=",".join(user_ids), screen_name=",".join(screen_names))
+    found_ids = []
+    found_names = []
+    for user in users:
+        uid = user['id']
+        if uid in user_ids:
+            found_ids.append(uid)
+            yield (uid, user)
+        uname = user['screen_name']
+        if uname in screen_names:
+            found_names.append(uname)
+            yield (uname, user)
+    for uid in set(user_ids) - set(found_ids):
+        yield (uid, None)
+    for name in set(screen_names) - set(found_names):
+        yield (name, None)
+
+
+def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fails=True,
+                  batch_method=tweet_download_batch):
+    def filter_lines(line):
+        # print('Checking {}'.format(line))
+        oid = line[0]
+        if (cached_id(oid, folder) and not update) or (id_failed(oid, folder) and not retry_failed):
             yield None
         else:
-            yield line
+            yield str(oid)

     def print_result(res):
-        tid, tweet = res
-        if tweet:
-            try:
-                write_tweet_json(tweet, folder=folder)
-                yield 1
-            except Exception as ex:
-                logger.error('%s: %s' % (tid, ex))
-                if not ignore_fails:
-                    raise
-        else:
-            logger.info('Tweet not recovered: {}'.format(tid))
-            with open(tweet_fail_file(tid, folder), 'w') as f:
-                print('Tweet not found', file=f)
-            yield -1
+        for oid, obj in res:
+            if obj:
+                try:
+                    write_json(obj, folder=folder, oid=oid)
+                    yield 1
+                except Exception as ex:
+                    logger.error('%s: %s' % (oid, ex))
+                    if not ignore_fails:
+                        raise
+            else:
+                logger.info('Object not recovered: {}'.format(oid))
+                with open(fail_file(oid, folder), 'w') as f:
+                    print('Object not found', file=f)
+                yield -1

-    def download_batch(batch):
-        tweets = wq.statuses.lookup(_id=",".join(batch), map=True)['id']
-        return tweets.items()
+    objects_to_crawl = filter(lambda x: x is not None, tqdm(parallel(filter_lines, lst), desc='Total objects'))
+    batch_method = partial(batch_method, wq)
+    tweets = parallel(batch_method, objects_to_crawl, 100)
+    for res in tqdm(parallel(print_result, tweets), desc='Queried'):
+        yield res
+
+
+def download_file(wq, csvfile, folder, column=0, delimiter=',',
+                  header=False, quotechar='"', batch_method=tweet_download_batch,
+                  **kwargs):
+    with open(csvfile) as f:
+        csvreader = csv.reader(f, delimiter=str(delimiter), quotechar=str(quotechar))
+        if header:
+            next(csvreader)
+        tweets = map(lambda row: row[0].strip(), csvreader)
+        for res in download_list(wq, tweets, folder, batch_method=batch_method,
+                                 **kwargs):
+            yield res

-    with open(tweetsfile) as f:
-        lines = map(lambda x: x.strip(), f)
-        lines_to_crawl = filter(lambda x: x is not None, tqdm(parallel(filter_line, lines), desc='Total lines'))
-        tweets = parallel(download_batch, lines_to_crawl, 100)
-        for res in tqdm(parallel(print_result, tweets), desc='Queried'):
-            pass

 def download_timeline(wq, user):
     return wq.statuses.user_timeline(id=user)
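
Taken together, `download_file`, `download_list` and the two batch methods replace the old tweet-only `download_tweets`. A minimal usage sketch, assuming a valid `.bitter.yaml` with credentials:

```python
from bitter import crawlers, utils

wq = crawlers.TwitterQueue.from_config(conffile='.bitter.yaml')

# Download every tweet id listed in the first column of the CSV;
# one JSON file per tweet is written into the 'tweets' folder.
for _ in utils.download_file(wq, 'tweet_ids.csv', 'tweets',
                             batch_method=utils.tweet_download_batch):
    pass

# Users work the same way, with the user batch method.
for _ in utils.download_file(wq, 'users.csv', 'users',
                             batch_method=utils.user_download_batch):
    pass
```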

tests/test_utils.py
@@ -54,7 +54,7 @@ class TestUtils(TestCase):
         toc = time.time()
         assert (tic-toc) < 600
         resp2 = utils.parallel(echo, [1,2,3,4], chunksize=2)
-        assert list(resp2) == [1,2,3,4]
+        assert list(resp2) == [1,2, 3,4]


 class TestUtilsEnv(TestUtils):
@@ -68,5 +68,3 @@ class TestUtilsEnv(TestUtils):
     def tearDown(self):
         if hasattr(self, 'oldenv'):
             os.environ['BITTER_CONFIG'] = self.oldenv
-
-