@ -8,8 +8,6 @@ import time
import sqlalchemy . types
import sqlalchemy . types
import threading
import threading
import sqlite3
import sqlite3
import operator
from functools import reduce
from tqdm import tqdm
from tqdm import tqdm
from sqlalchemy import exists
from sqlalchemy import exists
@ -19,7 +17,6 @@ from bitter import config as bconf
from bitter . models import make_session , User , ExtractorEntry , Following
from bitter . models import make_session , User , ExtractorEntry , Following
import sys
import sys
import csv as tsv
if sys . version_info < = ( 3 , 0 ) :
if sys . version_info < = ( 3 , 0 ) :
from contextlib2 import ExitStack
from contextlib2 import ExitStack
else :
else :
@ -29,48 +26,29 @@ else:
logger = logging . getLogger ( __name__ )
logger = logging . getLogger ( __name__ )
def serialize ( function ) :
def serialize ( function ) :
''' Common options to serialize output to CSV or other formats '''
''' Common options to serialize output to CSV or other formats '''
@click.option ( ' --csv ' , help = ' Print each object as a csv row. Provide a list of comma-separated fields to print. ' , default = ' ' , type = str )
@click.option ( ' --fields ' , help = ' Provide a list of comma-separated fields to print. ' , default = ' ' , type = str )
@click.option ( ' --ignore_missing ' , help = ' Do not show warnings for missing fields. ' , is_flag = True )
@click.option ( ' --header ' , help = ' Header that will be printed at the beginning of the file ' , default = None )
@click.option ( ' --header ' , help = ' Header that will be printed at the beginning of the file ' , default = None )
@click.option ( ' --csv ' , help = ' Print each object as a csv row. ' , is_flag = True )
@click.option ( ' --jsonlines ' , ' --json ' , help = ' Print each object as JSON in a new line. ' , is_flag = True )
@click.option ( ' --jsonlines ' , ' --json ' , help = ' Print each object as JSON in a new line. ' , is_flag = True )
@click.option ( ' --indented ' , help = ' Print each object as an indented JSON object ' , is_flag = True )
@click.option ( ' --indented ' , help = ' Print each object as an indented JSON object ' , is_flag = True )
@click.option ( ' --outdelimiter ' , help = ' Delimiter for some output formats, such as CSV. It defaults to \t ' , default = ' \t ' )
@click.option ( ' --outfile ' , help = ' Output file. It defaults to STDOUT ' , default = sys . stdout )
@click.option ( ' --outfile ' , help = ' Output file. It defaults to STDOUT ' , default = sys . stdout )
def decorated ( csv , header , jsonlines , indented , outfile , * * kwargs ) :
def decorated ( fields , ignore_missing , header , csv , jsonlines , indented , outfile , outdelimiter , * * kwargs ) :
if header :
print ( header )
it = function ( * * kwargs )
it = function ( * * kwargs )
outformat = ' json '
if csv :
outformat = ' csv '
elif jsonlines :
outformat = ' jsonlines '
elif indented :
outformat = ' indented '
def do ( out ) :
return utils . serialized ( it , outfile , outformat = outformat , fields = fields . split ( ' , ' ) , ignore_missing = ignore_missing , header = header , delimiter = outdelimiter )
if csv :
delimiter = ' \t '
writer = tsv . writer ( out , quoting = tsv . QUOTE_ALL , delimiter = delimiter )
if header is None :
# Print fields as header unless told otherwise
print ( csv . replace ( ' , ' , delimiter ) , file = out )
fields = list ( token . strip ( ) . split ( ' . ' ) for token in csv . split ( ' , ' ) )
for obj in it :
writer . writerow ( list ( reduce ( operator . getitem , field , obj ) for field in fields ) )
elif jsonlines :
for obj in it :
print ( json . dumps ( obj , sort_keys = True ) , file = out )
elif indented :
for obj in it :
print ( json . dumps ( obj , indent = 4 , sort_keys = True ) , file = out )
else :
for obj in it :
print ( obj , file = out )
if outfile is sys . stdout :
return do ( sys . stdout )
with open ( outfile , ' w ' ) as out :
return do ( out )
return decorated
return decorated
@ -190,13 +168,14 @@ The result is stored as individual json files in your folder of choice.''')
@click.option ( ' -u ' , ' --update ' , is_flag = True , default = False , help = ' Download tweet even if it is already present. WARNING: it will overwrite existing files! ' )
@click.option ( ' -u ' , ' --update ' , is_flag = True , default = False , help = ' Download tweet even if it is already present. WARNING: it will overwrite existing files! ' )
@click.option ( ' -r ' , ' --retry ' , is_flag = True , default = False , help = ' Retry failed downloads ' )
@click.option ( ' -r ' , ' --retry ' , is_flag = True , default = False , help = ' Retry failed downloads ' )
@click.option ( ' -d ' , ' --delimiter ' , default = " , " )
@click.option ( ' -d ' , ' --delimiter ' , default = " , " )
@click.option ( ' -n ' , ' --nocache ' , is_flag = True , default = False , help = ' Do not cache results ' )
@click.option ( ' --skip ' , help = ' Discard the first DISCARD lines (use them as a header) ' , default = 0 )
@click.option ( ' --skip ' , help = ' Discard the first DISCARD lines (use them as a header) ' , default = 0 )
@click.option ( ' --commentchar ' , help = ' Lines starting with this character will be ignored ' , default = None )
@click.option ( ' --commentchar ' , help = ' Lines starting with this character will be ignored ' , default = None )
@click.option ( ' -q ' , ' --quotechar ' , default = ' " ' )
@click.option ( ' -q ' , ' --quotechar ' , default = ' " ' )
@click.option ( ' -c ' , ' --column ' , type = int , default = 0 )
@click.option ( ' -c ' , ' --column ' , type = int , default = 0 )
@serialize
@serialize
@click.pass_context
@click.pass_context
def get_tweets ( ctx , tweetsfile , folder , update , retry , delimiter , skip, quotechar , commentchar , column ) :
def get_tweets ( ctx , tweetsfile , folder , update , retry , delimiter , nocache, skip, quotechar , commentchar , column ) :
if update and not click . confirm ( ' This may overwrite existing tweets. Continue? ' ) :
if update and not click . confirm ( ' This may overwrite existing tweets. Continue? ' ) :
click . echo ( ' Cancelling ' )
click . echo ( ' Cancelling ' )
return
return
@ -204,10 +183,9 @@ def get_tweets(ctx, tweetsfile, folder, update, retry, delimiter, skip, quotecha
status = tqdm ( ' Queried ' )
status = tqdm ( ' Queried ' )
failed = 0
failed = 0
for tid , obj in utils . download_file ( wq , tweetsfile , folder , delimiter = delimiter ,
for tid , obj in utils . download_tweets_file ( wq , tweetsfile , folder , delimiter = delimiter , cache = not nocache ,
batch_method = utils . tweet_download_batch ,
skip = skip , quotechar = quotechar , commentchar = commentchar ,
skip = skip , quotechar = quotechar , commentchar = commentchar ,
column = column , update = update , retry_failed = retry ) :
column = column , update = update , retry_failed = retry ) :
status . update ( 1 )
status . update ( 1 )
if not obj :
if not obj :
failed + = 1
failed + = 1
@ -264,6 +242,7 @@ def get_user(user, dry_run, folder, update):
@click.option ( ' -f ' , ' --folder ' , default = " users " )
@click.option ( ' -f ' , ' --folder ' , default = " users " )
@click.option ( ' -u ' , ' --update ' , is_flag = True , default = False , help = ' Download user even if it is already present. WARNING: it will overwrite existing files! ' )
@click.option ( ' -u ' , ' --update ' , is_flag = True , default = False , help = ' Download user even if it is already present. WARNING: it will overwrite existing files! ' )
@click.option ( ' -r ' , ' --retry ' , is_flag = True , default = False , help = ' Retry failed downloads ' )
@click.option ( ' -r ' , ' --retry ' , is_flag = True , default = False , help = ' Retry failed downloads ' )
@click.option ( ' -n ' , ' --nocache ' , is_flag = True , default = False , help = ' Do not cache results ' )
@click.option ( ' -d ' , ' --delimiter ' , default = " , " )
@click.option ( ' -d ' , ' --delimiter ' , default = " , " )
@click.option ( ' --skip ' , help = ' Discard the first SKIP lines (e.g., use them as a header) ' ,
@click.option ( ' --skip ' , help = ' Discard the first SKIP lines (e.g., use them as a header) ' ,
is_flag = True , default = False )
is_flag = True , default = False )
@ -272,17 +251,17 @@ def get_user(user, dry_run, folder, update):
@click.option ( ' -c ' , ' --column ' , type = int , default = 0 )
@click.option ( ' -c ' , ' --column ' , type = int , default = 0 )
@serialize
@serialize
@click.pass_context
@click.pass_context
def get_users ( ctx , usersfile , folder , update , retry , delimiter, skip , quotechar , commentchar , column ) :
def get_users ( ctx , usersfile , folder , update , retry , nocache, delimiter, skip , quotechar , commentchar , column ) :
if update and not click . confirm ( ' This may overwrite existing users. Continue? ' ) :
if update and not click . confirm ( ' This may overwrite existing users. Continue? ' ) :
click . echo ( ' Cancelling ' )
click . echo ( ' Cancelling ' )
return
return
wq = crawlers . TwitterQueue . from_config ( conffile = bconf . CONFIG_FILE )
wq = crawlers . TwitterQueue . from_config ( conffile = bconf . CONFIG_FILE )
for i in utils . download_ file( wq , usersfile , folder , delimiter = delimiter ,
for i in utils . download_ users_ file( wq , usersfile , folder , delimiter = delimiter ,
batch_method = utils . user_download_batch ,
update = update , retry_failed = retry ,
update = update , retry_failed = retry ,
skip = skip , quotechar = quotechar ,
skip = skip , quotechar = quotechar ,
cache = not nocache ,
commentchar = commentchar ,
commentchar = commentchar ,
column = column ) :
column = column ) :
yield i
yield i
@users.command ( ' crawl ' )
@users.command ( ' crawl ' )
@ -480,7 +459,6 @@ def extract(ctx, recursive, user, name, initfile):
@extractor.command ( ' reset ' )
@extractor.command ( ' reset ' )
@click.pass_context
@click.pass_context
def reset_extractor ( ctx ) :
def reset_extractor ( ctx ) :
wq = crawlers . TwitterQueue . from_config ( conffile = bconf . CONFIG_FILE )
db = ctx . obj [ ' DBURI ' ]
db = ctx . obj [ ' DBURI ' ]
session = make_session ( db )
session = make_session ( db )
session . query ( ExtractorEntry ) . filter ( ExtractorEntry . pending == True ) . update ( { ' pending ' : False } )
session . query ( ExtractorEntry ) . filter ( ExtractorEntry . pending == True ) . update ( { ' pending ' : False } )