mirror of https://github.com/balkian/bitter.git synced 2024-12-22 08:28:12 +00:00

Add serialize

J. Fernando Sánchez 2019-11-12 17:15:26 +01:00
parent 40a8b45231
commit 80b58541e7
3 changed files with 89 additions and 24 deletions

View File

@@ -1 +1 @@
-0.9.7
+0.10.0

View File

@@ -8,6 +8,8 @@ import time
 import sqlalchemy.types
 import threading
 import sqlite3
+import operator
+from functools import reduce
 from tqdm import tqdm
 from sqlalchemy import exists
@@ -17,6 +19,7 @@ from bitter import config as bconf
 from bitter.models import make_session, User, ExtractorEntry, Following
 import sys
+import csv as tsv
 
 if sys.version_info <= (3, 0):
     from contextlib2 import ExitStack
 else:
@@ -26,6 +29,50 @@ else:
 logger = logging.getLogger(__name__)
 
+
+def serialize(function):
+    '''Common options to serialize output to CSV or other formats'''
+    @click.option('--csv', help='Print each object as a csv row. Provide a list of comma-separated fields to print.', default='', type=str)
+    @click.option('--header', help='Header that will be printed at the beginning of the file', default=None)
+    @click.option('--json', '--jsonlines', help='Print each object as JSON in a new line.', is_flag=True)
+    @click.option('--indented', help='Print each object as an indented JSON object', is_flag=True)
+    @click.option('--outfile', help='Output file. It defaults to STDOUT', default=sys.stdout)
+    def decorated(csv, header, jsonlines, indented, outfile, **kwargs):
+        it = function(**kwargs)
+
+        def do(out):
+            # Write the header to the selected output, not unconditionally to stdout
+            if header:
+                print(header, file=out)
+            if csv:
+                writer = tsv.writer(out, quoting=tsv.QUOTE_ALL, delimiter='\t')
+                if header is None:
+                    # Print fields as header unless told otherwise
+                    print(csv, file=out)
+                fields = list(token.strip().split('.') for token in csv.split(','))
+                for obj in it:
+                    writer.writerow(list(reduce(operator.getitem, field, obj) for field in fields))
+            elif jsonlines:
+                for obj in it:
+                    print(json.dumps(obj, sort_keys=True), file=out)
+            elif indented:
+                for obj in it:
+                    print(json.dumps(obj, indent=4, sort_keys=True), file=out)
+            else:
+                for obj in it:
+                    print(obj, file=out)
+
+        if outfile is sys.stdout:
+            return do(sys.stdout)
+        with open(outfile, 'w') as out:
+            return do(out)
+
+    return decorated
+
 
 @click.group()
 @click.option("--verbose", is_flag=True)
 @click.option("--logging_level", required=False, default='WARN')
@@ -125,9 +172,10 @@ def tweet(ctx):
 @click.option('-f', '--folder', default="tweets")
 @click.option('-u', '--update', help="Update the file even if the tweet exists", is_flag=True, default=False)
 @click.argument('tweetid')
+@serialize
 def get_tweet(tweetid, write, folder, update):
     wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
-    utils.download_tweet(wq, tweetid, write, folder, update)
+    yield from utils.download_tweet(wq, tweetid, write, folder, update)
 
 
 @tweet.command('get_all', help='''Download tweets from a list of tweets in a CSV file.
 The result is stored as individual json files in your folder of choice.''')
@@ -136,12 +184,13 @@ The result is stored as individual json files in your folder of choice.''')
 @click.option('-u', '--update', is_flag=True, default=False, help='Download tweet even if it is already present. WARNING: it will overwrite existing files!')
 @click.option('-r', '--retry', is_flag=True, default=False, help='Retry failed downloads')
 @click.option('-d', '--delimiter', default=",")
-@click.option('-h', '--header', help='Discard the first line (use it as a header)',
-              is_flag=True, default=False)
+@click.option('--skip', help='Discard the first SKIP lines (use them as a header)', default=0)
+@click.option('--commentchar', help='Lines starting with this character will be ignored', default=None)
 @click.option('-q', '--quotechar', default='"')
 @click.option('-c', '--column', type=int, default=0)
+@serialize
 @click.pass_context
-def get_tweets(ctx, tweetsfile, folder, update, retry, delimiter, header, quotechar, column):
+def get_tweets(ctx, tweetsfile, folder, update, retry, delimiter, skip, quotechar, commentchar, column):
     if update and not click.confirm('This may overwrite existing tweets. Continue?'):
         click.echo('Cancelling')
         return
@@ -151,12 +200,15 @@ def get_tweets(ctx, tweetsfile, folder, update, retry, delimiter, header, quotec
     failed = 0
     for tid, obj in utils.download_file(wq, tweetsfile, folder, delimiter=delimiter,
                                         batch_method=utils.tweet_download_batch,
-                                        header=header, quotechar=quotechar,
+                                        skip=skip, quotechar=quotechar, commentchar=commentchar,
                                         column=column, update=update, retry_failed=retry):
         status.update(1)
         if not obj:
             failed += 1
             status.set_description('Failed: %s. Queried' % failed, refresh=True)
+            continue
+        yield obj
 
 
 @tweet.command('search')
 @click.argument('query')
@@ -220,12 +272,14 @@ def get_user(user, write, folder, update):
 @click.option('-u', '--update', is_flag=True, default=False, help='Download user even if it is already present. WARNING: it will overwrite existing files!')
 @click.option('-r', '--retry', is_flag=True, default=False, help='Retry failed downloads')
 @click.option('-d', '--delimiter', default=",")
-@click.option('-h', '--header', help='Discard the first line (use it as a header)',
-              is_flag=True, default=False)
+@click.option('--skip', help='Discard the first SKIP lines (e.g., use them as a header)',
+              default=0)
 @click.option('-q', '--quotechar', default='"')
+@click.option('--commentchar', help='Lines starting with this character will be ignored', default=None)
 @click.option('-c', '--column', type=int, default=0)
+@serialize
 @click.pass_context
-def get_users(ctx, usersfile, folder, update, retry, delimiter, header, quotechar, column):
+def get_users(ctx, usersfile, folder, update, retry, delimiter, skip, quotechar, commentchar, column):
     if update and not click.confirm('This may overwrite existing users. Continue?'):
         click.echo('Cancelling')
         return
@@ -233,9 +287,10 @@ def get_users(ctx, usersfile, folder, update, retry, delimiter, header, quotecha
     for i in utils.download_file(wq, usersfile, folder, delimiter=delimiter,
                                  batch_method=utils.user_download_batch,
                                  update=update, retry_failed=retry,
-                                 header=header, quotechar=quotechar,
+                                 skip=skip, quotechar=quotechar,
+                                 commentchar=commentchar,
                                  column=column):
-        pass
+        yield i
 
 
 @users.command('crawl')
 @click.option('--db', required=True, help='Database to save all users.')

View File

@@ -59,7 +59,14 @@ def chunk(iterable, n):
 def parallel(func, source, chunksize=1, numcpus=multiprocessing.cpu_count()):
     source = chunk(source, chunksize)
     p = ThreadPool(numcpus*2)
-    results = p.imap_unordered(func, source)
+
+    def wrapped_func(*args, **kwargs):
+        try:
+            return func(*args, **kwargs)
+        except Exception as ex:
+            print('Exception on parallel thread: {}'.format(ex), file=sys.stderr)
+            return []  # an empty iterable keeps chain.from_iterable() working
+
+    results = p.imap_unordered(wrapped_func, source)
     for i in chain.from_iterable(results):
         yield i
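
Note: the wrapper swallows worker exceptions so one bad chunk cannot kill the whole pipeline; returning an empty iterable (rather than falling through to None) is what keeps chain.from_iterable() working. A standalone sketch of the same pattern, with toy data:

    import sys
    from itertools import chain
    from multiprocessing.pool import ThreadPool


    def parallel(func, chunks, numcpus=2):
        def wrapped(args):
            try:
                return func(args)
            except Exception as ex:
                print('Exception on parallel thread: {}'.format(ex), file=sys.stderr)
                return []  # an empty iterable is flattened away silently

        pool = ThreadPool(numcpus * 2)
        for item in chain.from_iterable(pool.imap_unordered(wrapped, chunks)):
            yield item


    # The None chunk raises inside func, gets logged to stderr, and is skipped.
    print(sorted(parallel(lambda c: [x * 2 for x in c], [[1, 2], None, [3]])))
    # -> [2, 4, 6]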
@@ -106,7 +113,7 @@ def read_config(conffile):
         raise Exception('No config file or BITTER_CONFIG env variable.')
     else:
         f = io.StringIO(unicode(os.environ.get('BITTER_CONFIG', "")).strip().replace('\\n', '\n'))
-    return yaml.load(f) or {'credentials': []}
+    return yaml.load(f, Loader=yaml.SafeLoader) or {'credentials': []}
 
 
 def write_config(conf, conffile=None):
@@ -419,10 +426,13 @@ def extract(wq, recursive=False, user=None, initfile=None, dburi=None, extractor
     pending = pending_entries(dburi)
     session.close()
 
-    for i in tqdm(parallel(de, pending), desc='Downloading users', total=total_users):
-        logger.info("Got %s" % i)
+    with tqdm(parallel(de, pending), desc='Downloading users', total=total_users) as tq:
+        for i in tq:
+            tq.write('Got {}'.format(i))
+            logger.info("Got %s" % i)
 
 
 def pending_entries(dburi):
     session = make_session(dburi)
     while True:
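
Note: replacing the bare tqdm(...) loop with a with block and tq.write() lets per-user messages print above the progress bar instead of corrupting it. A self-contained sketch of the pattern:

    import time

    from tqdm import tqdm

    with tqdm(range(3), desc='Downloading users') as tq:
        for i in tq:
            # tqdm.write() prints above the bar, leaving it intact;
            # a plain print() here would break the bar's redraw.
            tq.write('Got {}'.format(i))
            time.sleep(0.1)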
@@ -465,16 +475,14 @@ def get_user(c, user):
     return c.users.lookup(screen_name=user)[0]
 
 
 def download_tweet(wq, tweetid, write=True, folder="downloaded_tweets", update=False):
-    cached = cached_id(tweetid, folder)
-    tweet = None
-    if update or not cached:
+    tweet = cached_id(tweetid, folder)
+    if update or not tweet:
         tweet = get_tweet(wq, tweetid)
-    js = json.dumps(tweet)
     if write:
         if tweet:
+            js = json.dumps(tweet)
             write_json(js, folder)
-    else:
-        print(js)
+    yield tweet
 
 
 def cached_id(oid, folder):
@@ -581,7 +589,6 @@ def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fail
     def filter_list(lst, done, down):
         print('filtering')
         for oid in lst:
-            # print('Checking {}'.format(line))
             cached = cached_id(oid, folder)
             if (cached and not update):
                 done.put((oid, cached))
@@ -635,12 +642,15 @@ def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fail
     wait.join()
 
 
-def download_file(wq, csvfile, folder, column=0, delimiter=',',
-                  header=False, quotechar='"', batch_method=tweet_download_batch,
+def download_file(wq, csvfile, folder, column=0, delimiter=',', skip=0,
+                  quotechar='"', commentchar=None, batch_method=tweet_download_batch,
                   **kwargs):
     with open(csvfile) as f:
+        if commentchar:
+            f = (line for line in f if not line.startswith(commentchar))
         csvreader = csv.reader(f, delimiter=str(delimiter), quotechar=str(quotechar))
-        if header:
+        for n in range(skip):
             next(csvreader)
 
         def reader(r):
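
Note: taken together, the new skip/commentchar handling filters comment lines out before the CSV reader sees them, then discards the first skip rows. A standalone sketch with hypothetical file contents:

    import csv
    import io


    def read_column(f, column=0, delimiter=',', skip=0, quotechar='"', commentchar=None):
        # Same preprocessing order as download_file: comments first, then skip.
        if commentchar:
            f = (line for line in f if not line.startswith(commentchar))
        reader = csv.reader(f, delimiter=str(delimiter), quotechar=str(quotechar))
        for _ in range(skip):
            next(reader)
        for row in reader:
            yield row[column]


    sample = io.StringIO('# a comment\nid,text\n123,"hello"\n456,"world"\n')
    print(list(read_column(sample, skip=1, commentchar='#')))
    # -> ['123', '456']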