diff --git a/bitter/VERSION b/bitter/VERSION index 5eef0f1..a3f5a8e 100644 --- a/bitter/VERSION +++ b/bitter/VERSION @@ -1 +1 @@ -0.10.2 +0.10.3 diff --git a/bitter/__init__.py b/bitter/__init__.py index 1041afa..0e4ae0e 100644 --- a/bitter/__init__.py +++ b/bitter/__init__.py @@ -6,10 +6,12 @@ http://github.com/balkian/bitter import os from .version import __version__ +from . import config as bconf -def easy(*args, **kwargs): +def easy(conffile=bconf.CONFIG_FILE): from .crawlers import TwitterQueue - return TwitterQueue.from_credentials(*args, **kwargs) + + return TwitterQueue.from_config(conffile=conffile) __all__ = ['cli', 'config', 'crawlers', 'models', 'utils' ] diff --git a/bitter/cli.py b/bitter/cli.py index 1cc9106..9c37b19 100644 --- a/bitter/cli.py +++ b/bitter/cli.py @@ -8,8 +8,6 @@ import time import sqlalchemy.types import threading import sqlite3 -import operator -from functools import reduce from tqdm import tqdm from sqlalchemy import exists @@ -19,7 +17,6 @@ from bitter import config as bconf from bitter.models import make_session, User, ExtractorEntry, Following import sys -import csv as tsv if sys.version_info <= (3, 0): from contextlib2 import ExitStack else: @@ -29,48 +26,29 @@ else: logger = logging.getLogger(__name__) - - def serialize(function): '''Common options to serialize output to CSV or other formats''' - @click.option('--csv', help='Print each object as a csv row. Provide a list of comma-separated fields to print.', default='', type=str) + @click.option('--fields', help='Provide a list of comma-separated fields to print.', default='', type=str) + @click.option('--ignore_missing', help='Do not show warnings for missing fields.', is_flag=True) @click.option('--header', help='Header that will be printed at the beginning of the file', default=None) + @click.option('--csv', help='Print each object as a csv row.', is_flag=True) @click.option('--jsonlines', '--json', help='Print each object as JSON in a new line.', is_flag=True) @click.option('--indented', help='Print each object as an indented JSON object', is_flag=True) + @click.option('--outdelimiter', help='Delimiter for some output formats, such as CSV. It defaults to \t', default='\t') @click.option('--outfile', help='Output file. It defaults to STDOUT', default=sys.stdout) - def decorated(csv, header, jsonlines, indented, outfile, **kwargs): - if header: - print(header) - + def decorated(fields, ignore_missing, header, csv, jsonlines, indented, outfile, outdelimiter, **kwargs): it = function(**kwargs) + outformat = 'json' + if csv: + outformat = 'csv' + elif jsonlines: + outformat = 'jsonlines' + elif indented: + outformat = 'indented' - def do(out): - - if csv: - delimiter = '\t' - writer = tsv.writer(out, quoting=tsv.QUOTE_ALL, delimiter=delimiter) - if header is None: - # Print fields as header unless told otherwise - print(csv.replace(',', delimiter), file=out) - fields = list(token.strip().split('.') for token in csv.split(',')) - for obj in it: - writer.writerow(list(reduce(operator.getitem, field, obj) for field in fields)) - elif jsonlines: - for obj in it: - print(json.dumps(obj, sort_keys=True), file=out) - elif indented: - for obj in it: - print(json.dumps(obj, indent=4, sort_keys=True), file=out) - else: - for obj in it: - print(obj, file=out) - - if outfile is sys.stdout: - return do(sys.stdout) + return utils.serialized(it, outfile, outformat=outformat, fields=fields.split(','), ignore_missing=ignore_missing, header=header, delimiter=outdelimiter) - with open(outfile, 'w') as out: - return do(out) return decorated @@ -190,13 +168,14 @@ The result is stored as individual json files in your folder of choice.''') @click.option('-u', '--update', is_flag=True, default=False, help='Download tweet even if it is already present. WARNING: it will overwrite existing files!') @click.option('-r', '--retry', is_flag=True, default=False, help='Retry failed downloads') @click.option('-d', '--delimiter', default=",") +@click.option('-n', '--nocache', is_flag=True, default=False, help='Do not cache results') @click.option('--skip', help='Discard the first DISCARD lines (use them as a header)', default=0) @click.option('--commentchar', help='Lines starting with this character will be ignored', default=None) @click.option('-q', '--quotechar', default='"') @click.option('-c', '--column', type=int, default=0) @serialize @click.pass_context -def get_tweets(ctx, tweetsfile, folder, update, retry, delimiter, skip, quotechar, commentchar, column): +def get_tweets(ctx, tweetsfile, folder, update, retry, delimiter, nocache, skip, quotechar, commentchar, column): if update and not click.confirm('This may overwrite existing tweets. Continue?'): click.echo('Cancelling') return @@ -204,10 +183,9 @@ def get_tweets(ctx, tweetsfile, folder, update, retry, delimiter, skip, quotecha status = tqdm('Queried') failed = 0 - for tid, obj in utils.download_file(wq, tweetsfile, folder, delimiter=delimiter, - batch_method=utils.tweet_download_batch, - skip=skip, quotechar=quotechar, commentchar=commentchar, - column=column, update=update, retry_failed=retry): + for tid, obj in utils.download_tweets_file(wq, tweetsfile, folder, delimiter=delimiter, cache=not nocache, + skip=skip, quotechar=quotechar, commentchar=commentchar, + column=column, update=update, retry_failed=retry): status.update(1) if not obj: failed += 1 @@ -264,6 +242,7 @@ def get_user(user, dry_run, folder, update): @click.option('-f', '--folder', default="users") @click.option('-u', '--update', is_flag=True, default=False, help='Download user even if it is already present. WARNING: it will overwrite existing files!') @click.option('-r', '--retry', is_flag=True, default=False, help='Retry failed downloads') +@click.option('-n', '--nocache', is_flag=True, default=False, help='Do not cache results') @click.option('-d', '--delimiter', default=",") @click.option('--skip', help='Discard the first SKIP lines (e.g., use them as a header)', is_flag=True, default=False) @@ -272,17 +251,17 @@ def get_user(user, dry_run, folder, update): @click.option('-c', '--column', type=int, default=0) @serialize @click.pass_context -def get_users(ctx, usersfile, folder, update, retry, delimiter, skip, quotechar, commentchar, column): +def get_users(ctx, usersfile, folder, update, retry, nocache, delimiter, skip, quotechar, commentchar, column): if update and not click.confirm('This may overwrite existing users. Continue?'): click.echo('Cancelling') return wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE) - for i in utils.download_file(wq, usersfile, folder, delimiter=delimiter, - batch_method=utils.user_download_batch, - update=update, retry_failed=retry, - skip=skip, quotechar=quotechar, - commentchar=commentchar, - column=column): + for i in utils.download_users_file(wq, usersfile, folder, delimiter=delimiter, + update=update, retry_failed=retry, + skip=skip, quotechar=quotechar, + cache=not nocache, + commentchar=commentchar, + column=column): yield i @users.command('crawl') @@ -480,7 +459,6 @@ def extract(ctx, recursive, user, name, initfile): @extractor.command('reset') @click.pass_context def reset_extractor(ctx): - wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE) db = ctx.obj['DBURI'] session = make_session(db) session.query(ExtractorEntry).filter(ExtractorEntry.pending==True).update({'pending':False}) diff --git a/bitter/utils.py b/bitter/utils.py index 8f39470..68233fc 100644 --- a/bitter/utils.py +++ b/bitter/utils.py @@ -19,7 +19,9 @@ import queue import threading from select import select -from functools import partial +import operator + +from functools import partial, reduce from tqdm import tqdm @@ -473,22 +475,22 @@ def get_user(c, user): except ValueError: return c.users.lookup(screen_name=user)[0] -def download_tweet(wq, tweetid, write=True, folder="downloaded_tweets", update=False): +def download_tweet(wq, tweetid, cache=True, folder="downloaded_tweets", update=False): tweet = cached_id(tweetid, folder) if update or not tweet: tweet = get_tweet(wq, tweetid) - if write and update: + if cache and update: if tweet: js = json.dumps(tweet) write_json(js, folder) yield tweet -def download_user(wq, userid, write=True, folder="downloaded_users", update=False): +def download_user(wq, userid, cache=True, folder="downloaded_users", update=False): user = cached_id(userid, folder) if update or not user: user = get_user(wq, userid) - if write and update: + if cache and update: if user: write_json(user, folder, aliases=[user['screen_name'], ]) yield user @@ -589,7 +591,8 @@ def dump_result(oid, obj, folder, ignore_fails=True): with open(fail_file(oid, folder), 'w') as f: print('Object not found', file=f) -def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fails=False, + +def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fails=False, cache=True, batch_method=tweet_download_batch): done = Queue() @@ -647,13 +650,24 @@ def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fail break oid, obj = rec - dump_result(oid, obj, folder, ignore_fails) + if cache or (not obj): + dump_result(oid, obj, folder, ignore_fails) yield rec wait.join() -def download_file(wq, csvfile, folder, column=0, delimiter=',', skip=0, +def download_tweets_file(*args, **kwargs): + kwargs['batch_method'] = tweet_download_batch + yield from download_file(*args, **kwargs) + + +def download_users_file(*args, **kwargs): + kwargs['batch_method'] = user_download_batch + yield from download_file(*args, **kwargs) + + +def download_file(wq, csvfile, folder, column=0, delimiter=',', skip=0, cache=True, quotechar='"', commentchar=None, batch_method=tweet_download_batch, **kwargs): with open(csvfile) as f: @@ -670,7 +684,7 @@ def download_file(wq, csvfile, folder, column=0, delimiter=',', skip=0, yield row[column].strip() - for res in download_list(wq, reader(csvreader), folder, batch_method=batch_method, + for res in download_list(wq, reader(csvreader), folder, batch_method=batch_method, cache=cache, **kwargs): yield res @@ -748,3 +762,42 @@ def _users_control(func, apiargs, remaining=0, **kwargs): if int(cursor) != -1: stop = False return resp['users'], stop + + +def serialized(it, outfile, outformat='csv', fields=[], header=None, ignore_missing=False, delimiter='\t'): + outformat = outformat.lower() + def do(out): + + if outformat == 'csv': + writer = csv.writer(out, quoting=csv.QUOTE_ALL, delimiter=delimiter) + if header != '': + h = header + if h is None: + h = delimiter.join(fields) + print(h, file=out) + attrs = list(token.strip().split('.') for token in fields) + for obj in it: + values = [] + for attr in attrs: + try: + values.append(reduce(operator.getitem, attr, obj)) + except KeyError: + if not ignore_missing: + print('Key not present: {}'.format(attr), file=sys.stderr) + values.append(None) + writer.writerow(values) + elif outformat == 'jsonlines': + for obj in it: + print(json.dumps(obj, sort_keys=True), file=out) + elif outformat == 'indented': + for obj in it: + print(json.dumps(obj, indent=4, sort_keys=True), file=out) + else: + for obj in it: + print(obj, file=out) + + if outfile is sys.stdout: + return do(sys.stdout) + + with open(outfile, 'w') as out: + return do(out)