
Version 0.10.3

* Ability to include "optional" fields from tweets (e.g., retweeted_status).
* Optional caching (can be disabled for very large datasets).
J. Fernando Sánchez 2020-06-24 11:01:02 +02:00
parent 030c41b826
commit ea848f1a78
4 changed files with 93 additions and 60 deletions
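
Both features meet in the CLI changes below: the new --fields option takes dotted paths into each object (so optional sub-objects such as retweeted_status can be selected), and the new -n/--nocache flag skips writing per-object JSON files. As a rough, standalone sketch of the dotted-path lookup that the new utils.serialized helper performs (the tweet dict here is invented):

    import operator
    from functools import reduce

    # Invented, minimal tweet: only the keys accessed below matter.
    tweet = {
        'id': 1,
        'text': 'RT @someone: hello',
        'retweeted_status': {'id': 2, 'text': 'hello'},
    }

    # Dotted paths as they would be passed via --fields
    for field in ['id', 'retweeted_status.id', 'user.screen_name']:
        path = field.split('.')
        try:
            value = reduce(operator.getitem, path, tweet)  # walk nested dicts
        except KeyError:
            value = None  # the case that --ignore_missing silences in the CLI
        print(field, value)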


@@ -1 +1 @@
-0.10.2
+0.10.3


@@ -6,10 +6,12 @@ http://github.com/balkian/bitter
 import os
 from .version import __version__
+from . import config as bconf
 
-def easy(*args, **kwargs):
+def easy(conffile=bconf.CONFIG_FILE):
     from .crawlers import TwitterQueue
-    return TwitterQueue.from_credentials(*args, **kwargs)
+    return TwitterQueue.from_config(conffile=conffile)
 
 __all__ = ['cli', 'config', 'crawlers', 'models', 'utils' ]
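
With this change, easy() builds the crawler queue from a configuration file rather than from explicit credentials. A minimal usage sketch, assuming a valid config file exists (the custom path below is purely illustrative):

    import bitter

    # Reads credentials from the default config file (bconf.CONFIG_FILE).
    wq = bitter.easy()

    # Or point it at a specific file; this path is made up for the example.
    wq = bitter.easy(conffile='./bitter.config')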


@@ -8,8 +8,6 @@ import time
 import sqlalchemy.types
 import threading
 import sqlite3
-import operator
-from functools import reduce
 from tqdm import tqdm
 from sqlalchemy import exists
@@ -19,7 +17,6 @@ from bitter import config as bconf
 from bitter.models import make_session, User, ExtractorEntry, Following
 import sys
-import csv as tsv
 if sys.version_info <= (3, 0):
     from contextlib2 import ExitStack
 else:
@@ -29,48 +26,29 @@ else:
 logger = logging.getLogger(__name__)
 
 def serialize(function):
     '''Common options to serialize output to CSV or other formats'''
-    @click.option('--csv', help='Print each object as a csv row. Provide a list of comma-separated fields to print.', default='', type=str)
+    @click.option('--fields', help='Provide a list of comma-separated fields to print.', default='', type=str)
+    @click.option('--ignore_missing', help='Do not show warnings for missing fields.', is_flag=True)
     @click.option('--header', help='Header that will be printed at the beginning of the file', default=None)
+    @click.option('--csv', help='Print each object as a csv row.', is_flag=True)
     @click.option('--jsonlines', '--json', help='Print each object as JSON in a new line.', is_flag=True)
     @click.option('--indented', help='Print each object as an indented JSON object', is_flag=True)
+    @click.option('--outdelimiter', help='Delimiter for some output formats, such as CSV. It defaults to \t', default='\t')
     @click.option('--outfile', help='Output file. It defaults to STDOUT', default=sys.stdout)
-    def decorated(csv, header, jsonlines, indented, outfile, **kwargs):
-        if header:
-            print(header)
+    def decorated(fields, ignore_missing, header, csv, jsonlines, indented, outfile, outdelimiter, **kwargs):
         it = function(**kwargs)
+        outformat = 'json'
+        if csv:
+            outformat = 'csv'
+        elif jsonlines:
+            outformat = 'jsonlines'
+        elif indented:
+            outformat = 'indented'
+        return utils.serialized(it, outfile, outformat=outformat, fields=fields.split(','), ignore_missing=ignore_missing, header=header, delimiter=outdelimiter)
-        def do(out):
-            if csv:
-                delimiter = '\t'
-                writer = tsv.writer(out, quoting=tsv.QUOTE_ALL, delimiter=delimiter)
-                if header is None:
-                    # Print fields as header unless told otherwise
-                    print(csv.replace(',', delimiter), file=out)
-                fields = list(token.strip().split('.') for token in csv.split(','))
-                for obj in it:
-                    writer.writerow(list(reduce(operator.getitem, field, obj) for field in fields))
-            elif jsonlines:
-                for obj in it:
-                    print(json.dumps(obj, sort_keys=True), file=out)
-            elif indented:
-                for obj in it:
-                    print(json.dumps(obj, indent=4, sort_keys=True), file=out)
-            else:
-                for obj in it:
-                    print(obj, file=out)
-        if outfile is sys.stdout:
-            return do(sys.stdout)
-        with open(outfile, 'w') as out:
-            return do(out)
     return decorated
@@ -190,13 +168,14 @@ The result is stored as individual json files in your folder of choice.''')
 @click.option('-u', '--update', is_flag=True, default=False, help='Download tweet even if it is already present. WARNING: it will overwrite existing files!')
 @click.option('-r', '--retry', is_flag=True, default=False, help='Retry failed downloads')
 @click.option('-d', '--delimiter', default=",")
+@click.option('-n', '--nocache', is_flag=True, default=False, help='Do not cache results')
 @click.option('--skip', help='Discard the first DISCARD lines (use them as a header)', default=0)
 @click.option('--commentchar', help='Lines starting with this character will be ignored', default=None)
 @click.option('-q', '--quotechar', default='"')
 @click.option('-c', '--column', type=int, default=0)
 @serialize
 @click.pass_context
-def get_tweets(ctx, tweetsfile, folder, update, retry, delimiter, skip, quotechar, commentchar, column):
+def get_tweets(ctx, tweetsfile, folder, update, retry, delimiter, nocache, skip, quotechar, commentchar, column):
     if update and not click.confirm('This may overwrite existing tweets. Continue?'):
         click.echo('Cancelling')
         return
@@ -204,10 +183,9 @@ def get_tweets(ctx, tweetsfile, folder, update, retry, delimiter, skip, quotecha
     status = tqdm('Queried')
     failed = 0
-    for tid, obj in utils.download_file(wq, tweetsfile, folder, delimiter=delimiter,
-                                        batch_method=utils.tweet_download_batch,
-                                        skip=skip, quotechar=quotechar, commentchar=commentchar,
-                                        column=column, update=update, retry_failed=retry):
+    for tid, obj in utils.download_tweets_file(wq, tweetsfile, folder, delimiter=delimiter, cache=not nocache,
+                                               skip=skip, quotechar=quotechar, commentchar=commentchar,
+                                               column=column, update=update, retry_failed=retry):
         status.update(1)
         if not obj:
             failed += 1
@@ -264,6 +242,7 @@ def get_user(user, dry_run, folder, update):
 @click.option('-f', '--folder', default="users")
 @click.option('-u', '--update', is_flag=True, default=False, help='Download user even if it is already present. WARNING: it will overwrite existing files!')
 @click.option('-r', '--retry', is_flag=True, default=False, help='Retry failed downloads')
+@click.option('-n', '--nocache', is_flag=True, default=False, help='Do not cache results')
 @click.option('-d', '--delimiter', default=",")
 @click.option('--skip', help='Discard the first SKIP lines (e.g., use them as a header)',
               is_flag=True, default=False)
@@ -272,17 +251,17 @@ def get_user(user, dry_run, folder, update):
 @click.option('-c', '--column', type=int, default=0)
 @serialize
 @click.pass_context
-def get_users(ctx, usersfile, folder, update, retry, delimiter, skip, quotechar, commentchar, column):
+def get_users(ctx, usersfile, folder, update, retry, nocache, delimiter, skip, quotechar, commentchar, column):
     if update and not click.confirm('This may overwrite existing users. Continue?'):
         click.echo('Cancelling')
         return
     wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
-    for i in utils.download_file(wq, usersfile, folder, delimiter=delimiter,
-                                 batch_method=utils.user_download_batch,
-                                 update=update, retry_failed=retry,
-                                 skip=skip, quotechar=quotechar,
+    for i in utils.download_users_file(wq, usersfile, folder, delimiter=delimiter,
+                                       update=update, retry_failed=retry,
+                                       skip=skip, quotechar=quotechar,
+                                       cache=not nocache,
                                        commentchar=commentchar,
                                        column=column):
         yield i
 
 @users.command('crawl')
@@ -480,7 +459,6 @@ def extract(ctx, recursive, user, name, initfile):
 @extractor.command('reset')
 @click.pass_context
 def reset_extractor(ctx):
-    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     db = ctx.obj['DBURI']
     session = make_session(db)
     session.query(ExtractorEntry).filter(ExtractorEntry.pending==True).update({'pending':False})
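
At the CLI level, the new -n/--nocache flag on both download commands simply becomes cache=not nocache in the utils calls: objects are still fetched and yielded, but no per-id JSON file is written. A hedged sketch of the same behaviour from Python (the input file and output folder names are invented):

    from bitter import easy, utils

    wq = easy()  # credentials from the default config file

    # Download the ids listed in the first column of ids.csv, skipping the
    # on-disk JSON cache (what -n/--nocache does on the command line).
    for tid, tweet in utils.download_tweets_file(wq, 'ids.csv', 'downloaded_tweets',
                                                 cache=False, retry_failed=True):
        if tweet:
            print(tid, tweet.get('text', ''))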


@@ -19,7 +19,9 @@ import queue
 import threading
 from select import select
-from functools import partial
+import operator
+from functools import partial, reduce
 from tqdm import tqdm
@@ -473,22 +475,22 @@ def get_user(c, user):
     except ValueError:
         return c.users.lookup(screen_name=user)[0]
 
-def download_tweet(wq, tweetid, write=True, folder="downloaded_tweets", update=False):
+def download_tweet(wq, tweetid, cache=True, folder="downloaded_tweets", update=False):
     tweet = cached_id(tweetid, folder)
     if update or not tweet:
         tweet = get_tweet(wq, tweetid)
-        if write and update:
+        if cache and update:
             if tweet:
                 js = json.dumps(tweet)
                 write_json(js, folder)
     yield tweet
 
-def download_user(wq, userid, write=True, folder="downloaded_users", update=False):
+def download_user(wq, userid, cache=True, folder="downloaded_users", update=False):
     user = cached_id(userid, folder)
     if update or not user:
         user = get_user(wq, userid)
-        if write and update:
+        if cache and update:
             if user:
                 write_json(user, folder, aliases=[user['screen_name'], ])
     yield user
@@ -589,7 +591,8 @@ def dump_result(oid, obj, folder, ignore_fails=True):
         with open(fail_file(oid, folder), 'w') as f:
             print('Object not found', file=f)
 
-def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fails=False,
+def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fails=False, cache=True,
                   batch_method=tweet_download_batch):
     done = Queue()
@@ -647,13 +650,24 @@ def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fail
             break
         oid, obj = rec
-        dump_result(oid, obj, folder, ignore_fails)
+        if cache or (not obj):
+            dump_result(oid, obj, folder, ignore_fails)
         yield rec
     wait.join()
 
-def download_file(wq, csvfile, folder, column=0, delimiter=',', skip=0,
+def download_tweets_file(*args, **kwargs):
+    kwargs['batch_method'] = tweet_download_batch
+    yield from download_file(*args, **kwargs)
+
+
+def download_users_file(*args, **kwargs):
+    kwargs['batch_method'] = user_download_batch
+    yield from download_file(*args, **kwargs)
+
+
+def download_file(wq, csvfile, folder, column=0, delimiter=',', skip=0, cache=True,
                   quotechar='"', commentchar=None, batch_method=tweet_download_batch,
                   **kwargs):
     with open(csvfile) as f:
@@ -670,7 +684,7 @@ def download_file(wq, csvfile, folder, column=0, delimiter=',', skip=0,
             yield row[column].strip()
 
-        for res in download_list(wq, reader(csvreader), folder, batch_method=batch_method,
+        for res in download_list(wq, reader(csvreader), folder, batch_method=batch_method, cache=cache,
                                  **kwargs):
             yield res
@@ -748,3 +762,42 @@ def _users_control(func, apiargs, remaining=0, **kwargs):
     if int(cursor) != -1:
         stop = False
     return resp['users'], stop
+
+
+def serialized(it, outfile, outformat='csv', fields=[], header=None, ignore_missing=False, delimiter='\t'):
+    outformat = outformat.lower()
+
+    def do(out):
+        if outformat == 'csv':
+            writer = csv.writer(out, quoting=csv.QUOTE_ALL, delimiter=delimiter)
+            if header != '':
+                h = header
+                if h is None:
+                    h = delimiter.join(fields)
+                print(h, file=out)
+            attrs = list(token.strip().split('.') for token in fields)
+            for obj in it:
+                values = []
+                for attr in attrs:
+                    try:
+                        values.append(reduce(operator.getitem, attr, obj))
+                    except KeyError:
+                        if not ignore_missing:
+                            print('Key not present: {}'.format(attr), file=sys.stderr)
+                        values.append(None)
+                writer.writerow(values)
+        elif outformat == 'jsonlines':
+            for obj in it:
+                print(json.dumps(obj, sort_keys=True), file=out)
+        elif outformat == 'indented':
+            for obj in it:
+                print(json.dumps(obj, indent=4, sort_keys=True), file=out)
+        else:
+            for obj in it:
+                print(obj, file=out)
+
+    if outfile is sys.stdout:
+        return do(sys.stdout)
+
+    with open(outfile, 'w') as out:
+        return do(out)
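
For reference, a short usage sketch of the new helper; the status dicts and field names are invented:

    import sys
    from bitter import utils

    statuses = [
        {'id': 1, 'user': {'screen_name': 'alice'}},
        {'id': 2, 'user': {'screen_name': 'bob'}, 'retweeted_status': {'id': 3}},
    ]

    # Tab-separated output on stdout. The missing retweeted_status in the first
    # row becomes an empty value rather than a warning, thanks to ignore_missing.
    utils.serialized(statuses, sys.stdout, outformat='csv',
                     fields=['id', 'user.screen_name', 'retweeted_status.id'],
                     ignore_missing=True)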