Mirror of https://github.com/balkian/bitter.git (synced 2024-12-22 00:18:12 +00:00)
Version 0.10.3
* Ability to include "optional" fields from tweets (e.g., retweeted_status).
* Optional caching (for very large datasets).
parent 030c41b826
commit ea848f1a78
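The new --fields option added below accepts comma-separated dotted paths, so nested and optional attributes such as retweeted_status.id can be pulled out of each tweet. A minimal sketch of the lookup it performs; the sample tweet dict is invented for illustration, but the mechanism is the reduce(operator.getitem, ...) call introduced in this commit:

import operator
from functools import reduce

# Hypothetical tweet: retweeted_status only exists on retweets, which is
# what makes these fields "optional".
tweet = {'id': 1,
         'user': {'screen_name': 'balkian'},
         'retweeted_status': {'id': 99}}

for path in ('user.screen_name', 'retweeted_status.id'):
    attr = path.split('.')                          # 'a.b' -> ['a', 'b']
    value = reduce(operator.getitem, attr, tweet)   # tweet['a']['b']
    print(path, '->', value)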
bitter/VERSION
@@ -1 +1 @@
-0.10.2
+0.10.3
bitter/__init__.py
@@ -6,10 +6,12 @@ http://github.com/balkian/bitter
 import os
 
 from .version import __version__
+
+from . import config as bconf
 
-def easy(*args, **kwargs):
+def easy(conffile=bconf.CONFIG_FILE):
     from .crawlers import TwitterQueue
-    return TwitterQueue.from_credentials(*args, **kwargs)
+    return TwitterQueue.from_config(conffile=conffile)
 
 __all__ = ['cli', 'config', 'crawlers', 'models', 'utils' ]
 
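easy() now builds the queue from a config file instead of taking raw credentials. A sketch of the new call; the explicit path is hypothetical, and by default bconf.CONFIG_FILE is used:

import bitter

# Reads credentials from the default config file (bconf.CONFIG_FILE),
# which must already exist:
wq = bitter.easy()

# Or point it at a hypothetical explicit path:
# wq = bitter.easy(conffile='/home/user/.bitter-credentials')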
bitter/cli.py
@@ -8,8 +8,6 @@ import time
 import sqlalchemy.types
 import threading
 import sqlite3
-import operator
-from functools import reduce
 from tqdm import tqdm
 
 from sqlalchemy import exists
@@ -19,7 +17,6 @@ from bitter import config as bconf
 from bitter.models import make_session, User, ExtractorEntry, Following
 
 import sys
-import csv as tsv
 if sys.version_info <= (3, 0):
     from contextlib2 import ExitStack
 else:
@@ -29,48 +26,29 @@ else:
 
 logger = logging.getLogger(__name__)
 
 
 def serialize(function):
     '''Common options to serialize output to CSV or other formats'''
 
-    @click.option('--csv', help='Print each object as a csv row. Provide a list of comma-separated fields to print.', default='', type=str)
+    @click.option('--fields', help='Provide a list of comma-separated fields to print.', default='', type=str)
+    @click.option('--ignore_missing', help='Do not show warnings for missing fields.', is_flag=True)
     @click.option('--header', help='Header that will be printed at the beginning of the file', default=None)
+    @click.option('--csv', help='Print each object as a csv row.', is_flag=True)
     @click.option('--jsonlines', '--json', help='Print each object as JSON in a new line.', is_flag=True)
     @click.option('--indented', help='Print each object as an indented JSON object', is_flag=True)
+    @click.option('--outdelimiter', help='Delimiter for some output formats, such as CSV. It defaults to \t', default='\t')
     @click.option('--outfile', help='Output file. It defaults to STDOUT', default=sys.stdout)
-    def decorated(csv, header, jsonlines, indented, outfile, **kwargs):
-        if header:
-            print(header)
+    def decorated(fields, ignore_missing, header, csv, jsonlines, indented, outfile, outdelimiter, **kwargs):
 
         it = function(**kwargs)
+        outformat = 'json'
+        if csv:
+            outformat = 'csv'
+        elif jsonlines:
+            outformat = 'jsonlines'
+        elif indented:
+            outformat = 'indented'
 
-        def do(out):
-            if csv:
-                delimiter = '\t'
-                writer = tsv.writer(out, quoting=tsv.QUOTE_ALL, delimiter=delimiter)
-                if header is None:
-                    # Print fields as header unless told otherwise
-                    print(csv.replace(',', delimiter), file=out)
-                fields = list(token.strip().split('.') for token in csv.split(','))
-                for obj in it:
-                    writer.writerow(list(reduce(operator.getitem, field, obj) for field in fields))
-            elif jsonlines:
-                for obj in it:
-                    print(json.dumps(obj, sort_keys=True), file=out)
-            elif indented:
-                for obj in it:
-                    print(json.dumps(obj, indent=4, sort_keys=True), file=out)
-            else:
-                for obj in it:
-                    print(obj, file=out)
-
-        if outfile is sys.stdout:
-            return do(sys.stdout)
-
-        with open(outfile, 'w') as out:
-            return do(out)
+        return utils.serialized(it, outfile, outformat=outformat, fields=fields.split(','), ignore_missing=ignore_missing, header=header, delimiter=outdelimiter)
     return decorated
 
 
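decorated() now just maps the flags to an outformat string and hands everything to the new utils.serialized helper (added to utils.py at the end of this diff). An equivalent direct call, assuming any iterable of dicts:

import sys
from bitter import utils

objects = [{'id': 1, 'user': {'screen_name': 'balkian'}}]

# Same effect as passing --csv --fields id,user.screen_name on the CLI:
utils.serialized(objects, sys.stdout, outformat='csv',
                 fields=['id', 'user.screen_name'],
                 ignore_missing=False, header=None, delimiter='\t')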
@@ -190,13 +168,14 @@ The result is stored as individual json files in your folder of choice.''')
 @click.option('-u', '--update', is_flag=True, default=False, help='Download tweet even if it is already present. WARNING: it will overwrite existing files!')
 @click.option('-r', '--retry', is_flag=True, default=False, help='Retry failed downloads')
 @click.option('-d', '--delimiter', default=",")
+@click.option('-n', '--nocache', is_flag=True, default=False, help='Do not cache results')
 @click.option('--skip', help='Discard the first DISCARD lines (use them as a header)', default=0)
 @click.option('--commentchar', help='Lines starting with this character will be ignored', default=None)
 @click.option('-q', '--quotechar', default='"')
 @click.option('-c', '--column', type=int, default=0)
 @serialize
 @click.pass_context
-def get_tweets(ctx, tweetsfile, folder, update, retry, delimiter, skip, quotechar, commentchar, column):
+def get_tweets(ctx, tweetsfile, folder, update, retry, delimiter, nocache, skip, quotechar, commentchar, column):
     if update and not click.confirm('This may overwrite existing tweets. Continue?'):
         click.echo('Cancelling')
         return
@@ -204,10 +183,9 @@ def get_tweets(ctx, tweetsfile, folder, update, retry, delimiter, skip, quotechar, commentchar, column):
 
     status = tqdm('Queried')
     failed = 0
-    for tid, obj in utils.download_file(wq, tweetsfile, folder, delimiter=delimiter,
-                                        batch_method=utils.tweet_download_batch,
-                                        skip=skip, quotechar=quotechar, commentchar=commentchar,
-                                        column=column, update=update, retry_failed=retry):
+    for tid, obj in utils.download_tweets_file(wq, tweetsfile, folder, delimiter=delimiter, cache=not nocache,
+                                               skip=skip, quotechar=quotechar, commentchar=commentchar,
+                                               column=column, update=update, retry_failed=retry):
         status.update(1)
         if not obj:
             failed += 1
@@ -264,6 +242,7 @@ def get_user(user, dry_run, folder, update):
 @click.option('-f', '--folder', default="users")
 @click.option('-u', '--update', is_flag=True, default=False, help='Download user even if it is already present. WARNING: it will overwrite existing files!')
 @click.option('-r', '--retry', is_flag=True, default=False, help='Retry failed downloads')
+@click.option('-n', '--nocache', is_flag=True, default=False, help='Do not cache results')
 @click.option('-d', '--delimiter', default=",")
 @click.option('--skip', help='Discard the first SKIP lines (e.g., use them as a header)',
               is_flag=True, default=False)
@@ -272,17 +251,17 @@ def get_user(user, dry_run, folder, update):
 @click.option('-c', '--column', type=int, default=0)
 @serialize
 @click.pass_context
-def get_users(ctx, usersfile, folder, update, retry, delimiter, skip, quotechar, commentchar, column):
+def get_users(ctx, usersfile, folder, update, retry, nocache, delimiter, skip, quotechar, commentchar, column):
     if update and not click.confirm('This may overwrite existing users. Continue?'):
         click.echo('Cancelling')
         return
     wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
-    for i in utils.download_file(wq, usersfile, folder, delimiter=delimiter,
-                                 batch_method=utils.user_download_batch,
-                                 update=update, retry_failed=retry,
-                                 skip=skip, quotechar=quotechar,
+    for i in utils.download_users_file(wq, usersfile, folder, delimiter=delimiter,
+                                       update=update, retry_failed=retry,
+                                       skip=skip, quotechar=quotechar,
+                                       cache=not nocache,
                                  commentchar=commentchar,
                                  column=column):
         yield i
 
 @users.command('crawl')
@@ -480,7 +459,6 @@ def extract(ctx, recursive, user, name, initfile):
 @extractor.command('reset')
 @click.pass_context
 def reset_extractor(ctx):
-    wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE)
     db = ctx.obj['DBURI']
     session = make_session(db)
     session.query(ExtractorEntry).filter(ExtractorEntry.pending==True).update({'pending':False})
bitter/utils.py
@@ -19,7 +19,9 @@ import queue
 import threading
 from select import select
 
-from functools import partial
+import operator
+
+from functools import partial, reduce
 
 from tqdm import tqdm
 
@@ -473,22 +475,22 @@ def get_user(c, user):
     except ValueError:
         return c.users.lookup(screen_name=user)[0]
 
-def download_tweet(wq, tweetid, write=True, folder="downloaded_tweets", update=False):
+def download_tweet(wq, tweetid, cache=True, folder="downloaded_tweets", update=False):
     tweet = cached_id(tweetid, folder)
     if update or not tweet:
         tweet = get_tweet(wq, tweetid)
-        if write and update:
+        if cache and update:
             if tweet:
                 js = json.dumps(tweet)
                 write_json(js, folder)
     yield tweet
 
 
-def download_user(wq, userid, write=True, folder="downloaded_users", update=False):
+def download_user(wq, userid, cache=True, folder="downloaded_users", update=False):
     user = cached_id(userid, folder)
     if update or not user:
         user = get_user(wq, userid)
-        if write and update:
+        if cache and update:
             if user:
                 write_json(user, folder, aliases=[user['screen_name'], ])
     yield user
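download_tweet and download_user rename write to cache, defaulting to True: a freshly fetched object is written to disk only when both cache and update are set, so cache=False keeps very large crawls from filling the folder. A sketch, assuming a working credentials file for bitter.easy(); the tweet id is illustrative:

import bitter
from bitter import utils

wq = bitter.easy()  # needs a valid bitter config file

# Fetch a tweet by id without writing it to the cache folder:
for tweet in utils.download_tweet(wq, '1234567890', cache=False,
                                  folder='downloaded_tweets', update=True):
    print(tweet['id'] if tweet else 'not found')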
@@ -589,7 +591,8 @@ def dump_result(oid, obj, folder, ignore_fails=True):
         with open(fail_file(oid, folder), 'w') as f:
             print('Object not found', file=f)
 
-def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fails=False,
+def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fails=False, cache=True,
                   batch_method=tweet_download_batch):
 
     done = Queue()
@@ -647,13 +650,24 @@ def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fails=False,
                 break
 
             oid, obj = rec
-            dump_result(oid, obj, folder, ignore_fails)
+            if cache or (not obj):
+                dump_result(oid, obj, folder, ignore_fails)
             yield rec
 
         wait.join()
 
 
-def download_file(wq, csvfile, folder, column=0, delimiter=',', skip=0,
+def download_tweets_file(*args, **kwargs):
+    kwargs['batch_method'] = tweet_download_batch
+    yield from download_file(*args, **kwargs)
+
+
+def download_users_file(*args, **kwargs):
+    kwargs['batch_method'] = user_download_batch
+    yield from download_file(*args, **kwargs)
+
+
+def download_file(wq, csvfile, folder, column=0, delimiter=',', skip=0, cache=True,
                   quotechar='"', commentchar=None, batch_method=tweet_download_batch,
                   **kwargs):
     with open(csvfile) as f:
@@ -670,7 +684,7 @@ def download_file(wq, csvfile, folder, column=0, delimiter=',', skip=0,
             yield row[column].strip()
 
 
-    for res in download_list(wq, reader(csvreader), folder, batch_method=batch_method,
+    for res in download_list(wq, reader(csvreader), folder, batch_method=batch_method, cache=cache,
                              **kwargs):
         yield res
 
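get_tweets and get_users in cli.py now go through these wrappers, which pin batch_method, instead of calling download_file directly. A usage sketch, with an illustrative file name and cache=False mirroring the new --nocache flag:

import bitter
from bitter import utils

wq = bitter.easy()  # needs a valid bitter config file

# One tweet id per line in ids.csv:
for tid, obj in utils.download_tweets_file(wq, 'ids.csv',
                                           'downloaded_tweets', cache=False):
    if not obj:
        print('failed:', tid)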
@@ -748,3 +762,42 @@ def _users_control(func, apiargs, remaining=0, **kwargs):
     if int(cursor) != -1:
         stop = False
     return resp['users'], stop
+
+
+def serialized(it, outfile, outformat='csv', fields=[], header=None, ignore_missing=False, delimiter='\t'):
+    outformat = outformat.lower()
+
+    def do(out):
+        if outformat == 'csv':
+            writer = csv.writer(out, quoting=csv.QUOTE_ALL, delimiter=delimiter)
+            if header != '':
+                h = header
+                if h is None:
+                    h = delimiter.join(fields)
+                print(h, file=out)
+            attrs = list(token.strip().split('.') for token in fields)
+            for obj in it:
+                values = []
+                for attr in attrs:
+                    try:
+                        values.append(reduce(operator.getitem, attr, obj))
+                    except KeyError:
+                        if not ignore_missing:
+                            print('Key not present: {}'.format(attr), file=sys.stderr)
+                        values.append(None)
+                writer.writerow(values)
+        elif outformat == 'jsonlines':
+            for obj in it:
+                print(json.dumps(obj, sort_keys=True), file=out)
+        elif outformat == 'indented':
+            for obj in it:
+                print(json.dumps(obj, indent=4, sort_keys=True), file=out)
+        else:
+            for obj in it:
+                print(obj, file=out)
+
+    if outfile is sys.stdout:
+        return do(sys.stdout)
+
+    with open(outfile, 'w') as out:
+        return do(out)