@ -8,8 +8,6 @@ import time
import sqlalchemy . types
import threading
import sqlite3
import operator
from functools import reduce
from tqdm import tqdm
from sqlalchemy import exists
@ -19,7 +17,6 @@ from bitter import config as bconf
from bitter . models import make_session , User , ExtractorEntry , Following
import sys
import csv as tsv
if sys . version_info < = ( 3 , 0 ) :
from contextlib2 import ExitStack
else :
@ -29,48 +26,29 @@ else:
logger = logging . getLogger ( __name__ )
def serialize ( function ) :
''' Common options to serialize output to CSV or other formats '''
@click.option ( ' --csv ' , help = ' Print each object as a csv row. Provide a list of comma-separated fields to print. ' , default = ' ' , type = str )
@click.option ( ' --fields ' , help = ' Provide a list of comma-separated fields to print. ' , default = ' ' , type = str )
@click.option ( ' --ignore_missing ' , help = ' Do not show warnings for missing fields. ' , is_flag = True )
@click.option ( ' --header ' , help = ' Header that will be printed at the beginning of the file ' , default = None )
@click.option ( ' --csv ' , help = ' Print each object as a csv row. ' , is_flag = True )
@click.option ( ' --jsonlines ' , ' --json ' , help = ' Print each object as JSON in a new line. ' , is_flag = True )
@click.option ( ' --indented ' , help = ' Print each object as an indented JSON object ' , is_flag = True )
@click.option ( ' --outdelimiter ' , help = ' Delimiter for some output formats, such as CSV. It defaults to \t ' , default = ' \t ' )
@click.option ( ' --outfile ' , help = ' Output file. It defaults to STDOUT ' , default = sys . stdout )
def decorated ( csv , header , jsonlines , indented , outfile , * * kwargs ) :
if header :
print ( header )
def decorated ( fields , ignore_missing , header , csv , jsonlines , indented , outfile , outdelimiter , * * kwargs ) :
it = function ( * * kwargs )
outformat = ' json '
if csv :
outformat = ' csv '
elif jsonlines :
outformat = ' jsonlines '
elif indented :
outformat = ' indented '
def do ( out ) :
if csv :
delimiter = ' \t '
writer = tsv . writer ( out , quoting = tsv . QUOTE_ALL , delimiter = delimiter )
if header is None :
# Print fields as header unless told otherwise
print ( csv . replace ( ' , ' , delimiter ) , file = out )
fields = list ( token . strip ( ) . split ( ' . ' ) for token in csv . split ( ' , ' ) )
for obj in it :
writer . writerow ( list ( reduce ( operator . getitem , field , obj ) for field in fields ) )
elif jsonlines :
for obj in it :
print ( json . dumps ( obj , sort_keys = True ) , file = out )
elif indented :
for obj in it :
print ( json . dumps ( obj , indent = 4 , sort_keys = True ) , file = out )
else :
for obj in it :
print ( obj , file = out )
if outfile is sys . stdout :
return do ( sys . stdout )
return utils . serialized ( it , outfile , outformat = outformat , fields = fields . split ( ' , ' ) , ignore_missing = ignore_missing , header = header , delimiter = outdelimiter )
with open ( outfile , ' w ' ) as out :
return do ( out )
return decorated
@ -190,13 +168,14 @@ The result is stored as individual json files in your folder of choice.''')
@click.option ( ' -u ' , ' --update ' , is_flag = True , default = False , help = ' Download tweet even if it is already present. WARNING: it will overwrite existing files! ' )
@click.option ( ' -r ' , ' --retry ' , is_flag = True , default = False , help = ' Retry failed downloads ' )
@click.option ( ' -d ' , ' --delimiter ' , default = " , " )
@click.option ( ' -n ' , ' --nocache ' , is_flag = True , default = False , help = ' Do not cache results ' )
@click.option ( ' --skip ' , help = ' Discard the first DISCARD lines (use them as a header) ' , default = 0 )
@click.option ( ' --commentchar ' , help = ' Lines starting with this character will be ignored ' , default = None )
@click.option ( ' -q ' , ' --quotechar ' , default = ' " ' )
@click.option ( ' -c ' , ' --column ' , type = int , default = 0 )
@serialize
@click.pass_context
def get_tweets ( ctx , tweetsfile , folder , update , retry , delimiter , skip, quotechar , commentchar , column ) :
def get_tweets ( ctx , tweetsfile , folder , update , retry , delimiter , nocache, skip, quotechar , commentchar , column ) :
if update and not click . confirm ( ' This may overwrite existing tweets. Continue? ' ) :
click . echo ( ' Cancelling ' )
return
@ -204,10 +183,9 @@ def get_tweets(ctx, tweetsfile, folder, update, retry, delimiter, skip, quotecha
status = tqdm ( ' Queried ' )
failed = 0
for tid , obj in utils . download_file ( wq , tweetsfile , folder , delimiter = delimiter ,
batch_method = utils . tweet_download_batch ,
skip = skip , quotechar = quotechar , commentchar = commentchar ,
column = column , update = update , retry_failed = retry ) :
for tid , obj in utils . download_tweets_file ( wq , tweetsfile , folder , delimiter = delimiter , cache = not nocache ,
skip = skip , quotechar = quotechar , commentchar = commentchar ,
column = column , update = update , retry_failed = retry ) :
status . update ( 1 )
if not obj :
failed + = 1
@ -264,6 +242,7 @@ def get_user(user, dry_run, folder, update):
@click.option ( ' -f ' , ' --folder ' , default = " users " )
@click.option ( ' -u ' , ' --update ' , is_flag = True , default = False , help = ' Download user even if it is already present. WARNING: it will overwrite existing files! ' )
@click.option ( ' -r ' , ' --retry ' , is_flag = True , default = False , help = ' Retry failed downloads ' )
@click.option ( ' -n ' , ' --nocache ' , is_flag = True , default = False , help = ' Do not cache results ' )
@click.option ( ' -d ' , ' --delimiter ' , default = " , " )
@click.option ( ' --skip ' , help = ' Discard the first SKIP lines (e.g., use them as a header) ' ,
is_flag = True , default = False )
@ -272,17 +251,17 @@ def get_user(user, dry_run, folder, update):
@click.option ( ' -c ' , ' --column ' , type = int , default = 0 )
@serialize
@click.pass_context
def get_users ( ctx , usersfile , folder , update , retry , delimiter, skip , quotechar , commentchar , column ) :
def get_users ( ctx , usersfile , folder , update , retry , nocache, delimiter, skip , quotechar , commentchar , column ) :
if update and not click . confirm ( ' This may overwrite existing users. Continue? ' ) :
click . echo ( ' Cancelling ' )
return
wq = crawlers . TwitterQueue . from_config ( conffile = bconf . CONFIG_FILE )
for i in utils . download_ file( wq , usersfile , folder , delimiter = delimiter ,
batch_method = utils . user_download_batch ,
update = update , retry_failed = retry ,
skip = skip , quotechar = quotechar ,
commentchar = commentchar ,
column = column ) :
for i in utils . download_ users_ file( wq , usersfile , folder , delimiter = delimiter ,
update = update , retry_failed = retry ,
skip = skip , quotechar = quotechar ,
cache = not nocache ,
commentchar = commentchar ,
column = column ) :
yield i
@users.command ( ' crawl ' )
@ -480,7 +459,6 @@ def extract(ctx, recursive, user, name, initfile):
@extractor.command ( ' reset ' )
@click.pass_context
def reset_extractor ( ctx ) :
wq = crawlers . TwitterQueue . from_config ( conffile = bconf . CONFIG_FILE )
db = ctx . obj [ ' DBURI ' ]
session = make_session ( db )
session . query ( ExtractorEntry ) . filter ( ExtractorEntry . pending == True ) . update ( { ' pending ' : False } )