from __future__ import print_function

import logging
import time
import json
import yaml
import csv
import io
import signal
import sys
import sqlalchemy
import os
import multiprocessing

from multiprocessing.pool import ThreadPool
from multiprocessing import Queue

import queue
import threading

from select import select
from functools import partial

from tqdm import tqdm

from itertools import islice, chain
from contextlib import contextmanager
from collections import Counter
from random import choice

from builtins import map, filter

from twitter import TwitterHTTPError

from bitter.models import Following, User, ExtractorEntry, make_session

# Aliased because the name `config` is rebound below as a context manager.
from bitter import config as bconf

# Fix Python 2.x.
try:
    UNICODE_EXISTS = bool(type(unicode))
except NameError:
    unicode = lambda s: str(s)

logger = logging.getLogger(__name__)


def signal_handler(signal, frame):
    logger.info('You pressed Ctrl+C!')
    sys.exit(0)


def chunk(iterable, n):
    it = iter(iterable)
    return iter(lambda: tuple(islice(it, n)), ())


def parallel(func, source, chunksize=1, numcpus=multiprocessing.cpu_count()):
    source = chunk(source, chunksize)
    p = ThreadPool(numcpus*2)
    results = p.imap_unordered(func, source)
    for i in chain.from_iterable(results):
        yield i
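

# Usage sketch (not part of the library API): `parallel` hands each worker a
# *chunk* (a tuple of items) and flattens whatever iterable the worker
# returns, so results may arrive out of order:
#
#     def double_all(batch):            # batch is a tuple, e.g. (0, 1)
#         return [i * 2 for i in batch]
#
#     sorted(parallel(double_all, range(4), chunksize=2))  # -> [0, 2, 4, 6]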


def get_config_path(conf=None):
    if not conf:
        if bconf.CONFIG_FILE:
            conf = bconf.CONFIG_FILE
        else:
            raise Exception('No valid config file')
    return os.path.expanduser(conf)


def copy_credentials_to_config(credfile, conffile=None):
    p = get_config_path(credfile)
    with open(p) as old:
        for line in old:
            cred = json.loads(line.strip())
            add_credentials(conffile, **cred)


def save_config(conf, conffile=None):
    with config(conffile) as c:
        c.clear()
        c.update(conf)


@contextmanager
def config(conffile=None):
    d = read_config(conffile)
    try:
        yield d
    finally:
        write_config(d, conffile)


def read_config(conffile):
    p = conffile and get_config_path(conffile)
    if p:
        if not os.path.exists(p):
            raise IOError('{} file does not exist.'.format(p))
        with open(p, 'r') as f:
            return yaml.safe_load(f) or {'credentials': []}
    if 'BITTER_CONFIG' not in os.environ:
        raise Exception('No config file or BITTER_CONFIG env variable.')
    f = io.StringIO(unicode(os.environ.get('BITTER_CONFIG', "")).strip().replace('\\n', '\n'))
    return yaml.safe_load(f) or {'credentials': []}


def write_config(conf, conffile=None):
    if not conf:
        conf = {'credentials': []}
    if conffile:
        p = get_config_path(conffile)
        with open(p, 'w') as f:
            yaml.dump(conf, f)
    else:
        os.environ['BITTER_CONFIG'] = yaml.dump(conf)


def iter_credentials(conffile=None):
    with config(conffile) as c:
        for i in c['credentials']:
            yield i


def create_config_file(conffile=None):
    if not conffile:
        return
    conffile = get_config_path(conffile)
    with open(conffile, 'a'):
        pass
    write_config(None, conffile)


def get_credentials(conffile=None, inverse=False, **kwargs):
    creds = []
    for i in iter_credentials(conffile):
        matches = all(map(lambda x: i[x[0]] == x[1], kwargs.items()))
        if matches and not inverse:
            creds.append(i)
        elif inverse and not matches:
            creds.append(i)
    return creds


def delete_credentials(conffile=None, **creds):
    tokeep = get_credentials(conffile, inverse=True, **creds)
    with config(conffile) as c:
        c['credentials'] = list(tokeep)


def add_credentials(conffile=None, **creds):
    try:
        exist = get_credentials(conffile, **creds)
    except IOError:
        exist = False
        create_config_file(conffile)
    if exist:
        return
    with config(conffile) as c:
        c['credentials'].append(creds)
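

# Round-trip sketch for the credential helpers above (the file path and the
# fields are hypothetical; any keyword arguments are stored as-is):
#
#     add_credentials('~/.bitter.yaml', user='fake_account', token='...')
#     get_credentials('~/.bitter.yaml', user='fake_account')
#     # -> [{'user': 'fake_account', 'token': '...'}]
#     delete_credentials('~/.bitter.yaml', user='fake_account')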


def get_hashtags(iter_tweets, best=None):
    c = Counter()
    for tweet in iter_tweets:
        c.update(tag['text'] for tag in tweet.get('entities', {}).get('hashtags', []))
    return c


def read_file(filename, tail=False):
    if filename == '-':
        f = sys.stdin
    else:
        f = open(filename)
    try:
        while True:
            line = f.readline()
            if line not in (None, '', '\n'):
                tweet = json.loads(line.strip())
                yield tweet
            else:
                if tail:
                    time.sleep(1)
                else:
                    return
    finally:
        if f != sys.stdin:
            f.close()
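

# Usage sketch: stream tweets from a newline-delimited JSON file, or from
# stdin with read_file('-'). With tail=True the generator keeps polling the
# file (tail -f style) instead of returning at EOF. The file name below is
# hypothetical:
#
#     for tweet in read_file('tweets.jsonl'):
#         print(tweet['id'])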


def get_users(wq, ulist, by_name=False, queue=None, max_users=100):
    t = 'name' if by_name else 'uid'
    logger.debug('Getting users by {}: {}'.format(t, ulist))
    ilist = iter(ulist)
    while True:
        userslice = ",".join(str(i) for i in islice(ilist, max_users))
        if not userslice:
            break
        try:
            if by_name:
                resp = wq.users.lookup(screen_name=userslice)
            else:
                resp = wq.users.lookup(user_id=userslice)
        except TwitterHTTPError as ex:
            if ex.e.code in (404,):
                resp = []
            else:
                raise
        if not resp:
            logger.debug('Empty response')
        for user in resp:
            user = trim_user(user)
            if queue:
                queue.put(user)
            else:
                yield user
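

# Sketch: `wq` is assumed to be an authenticated worker/queue client that
# proxies the Twitter REST API, as used throughout this module. The screen
# names are illustrative:
#
#     for user in get_users(wq, ['twitter', 'github'], by_name=True):
#         print(user['screen_name'])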


def trim_user(user):
    if 'status' in user:
        del user['status']
    if 'follow_request_sent' in user:
        del user['follow_request_sent']
    if 'created_at' in user:
        ts = time.strftime('%s', time.strptime(user['created_at'], '%a %b %d %H:%M:%S +0000 %Y'))
        user['created_at_stamp'] = ts
        del user['created_at']
    user['entities'] = json.dumps(user['entities'])
    return user


def add_user(user, dburi=None, session=None, update=False):
    if not session:
        session = make_session(dburi)

    user = trim_user(user)
    # Query objects are always truthy, so check for an actual row before
    # deciding whether the user already exists.
    olduser = session.query(User).filter(User.id == user['id'])
    if olduser.first():
        if not update:
            return
        olduser.delete()
    nuser = User()
    for key, value in user.items():
        setattr(nuser, key, value)
    user = nuser
    # At this point the user is either new or replacing a deleted row, so it
    # always has to be persisted.
    session.add(user)
    logger.debug('Adding entry')
    entry = session.query(ExtractorEntry).filter(ExtractorEntry.user == user.id).first()
    if not entry:
        entry = ExtractorEntry(user=user.id)
        session.add(entry)
    logger.debug(entry.pending)
    entry.pending = True
    entry.cursor = -1
    session.commit()
    session.close()


def download_entry(wq, entry_id, dburi=None, recursive=False):
    session = make_session(dburi)
    if not session:
        raise Exception("Provide dburi or session")
    logger.info("Downloading entry: %s (%s)" % (entry_id, type(entry_id)))
    entry = session.query(ExtractorEntry).filter(ExtractorEntry.id == entry_id).first()
    user = session.query(User).filter(User.id == entry.user).first()
    download_user(wq, session, user, entry, recursive)
    session.close()


def download_user(wq, session, user, entry=None, recursive=False, max_followers=50000):

    # Make sure there is an entry before it is used below.
    if not entry:
        entry = session.query(ExtractorEntry).filter(ExtractorEntry.user == user.id).first() or ExtractorEntry(user=user.id)
        session.add(entry)
        session.commit()

    total_followers = user.followers_count

    if total_followers > max_followers:
        entry.pending = False
        logger.info("Too many followers for user: %s" % user.screen_name)
        session.add(entry)
        session.commit()
        return

    cursor = entry.cursor
    uid = user.id
    name = user.name

    logger.info("#"*20)
    logger.info("Getting %s - %s" % (uid, name))
    logger.info("Cursor %s" % cursor)
    logger.info("Using account: %s" % wq.name)

    def fetched_followers():
        return session.query(Following).filter(Following.isfollowed == uid).count()

    attempts = 0
    while cursor > 0 or fetched_followers() < total_followers:
        try:
            resp = wq.followers.ids(user_id=uid, cursor=cursor)
        except TwitterHTTPError as ex:
            attempts += 1
            if ex.e.code in (401, ) or attempts > 3:
                logger.info('Not authorized for user: {}'.format(uid))
                entry.errors = str(ex)
                break
            continue  # Retry transient errors up to three times
        if 'ids' not in resp:
            logger.info("Error with id %s %s" % (uid, resp))
            entry.pending = False
            entry.errors = "No ids in response: %s" % resp
            break

        logger.info("New followers: %s" % len(resp['ids']))
        if recursive:
            newusers = get_users(wq, resp['ids'])
            for newuser in newusers:
                add_user(session=session, user=newuser)

        if not resp['ids']:
            logger.info('NO IDS in response')
            break
        for i in resp['ids']:
            existing_user = session.query(Following).\
                filter(Following.isfollowed == uid).\
                filter(Following.follower == i).first()
            now = int(time.time())
            if existing_user:
                existing_user.created_at_stamp = now
            else:
                f = Following(isfollowed=uid,
                              follower=i,
                              created_at_stamp=now)
                session.add(f)

        logger.info("Fetched: %s/%s followers" % (fetched_followers(),
                                                  total_followers))
        # Advance both the local cursor and the persisted one, so the next
        # iteration requests the following page instead of repeating this one.
        cursor = resp["next_cursor"]
        entry.cursor = cursor

        session.add(entry)
        session.commit()

    logger.info("Done getting followers for %s" % uid)

    entry.pending = False
    entry.busy = False
    session.add(entry)
    session.commit()

    logger.debug('Entry: {} - {}'.format(entry.user, entry.pending))
    sys.stdout.flush()


def classify_user(id_or_name, screen_names, user_ids):
    try:
        int(id_or_name)
        user_ids.append(id_or_name)
        logger.debug("Added user id")
    except ValueError:
        logger.debug("Added screen_name")
        screen_names.append(id_or_name.split('@')[-1])


def extract(wq, recursive=False, user=None, initfile=None, dburi=None, extractor_name=None):
    signal.signal(signal.SIGINT, signal_handler)

    if not dburi:
        dburi = 'sqlite:///%s.db' % extractor_name

    session = make_session(dburi)
    session.query(ExtractorEntry).update({ExtractorEntry.busy: False})
    session.commit()

    if not (user or initfile):
        logger.info('Using pending users from last session')
    else:
        screen_names = []
        user_ids = []
        if user:
            classify_user(user, screen_names, user_ids)
        elif initfile:
            logger.info("No user. I will open %s" % initfile)
            with open(initfile, 'r') as f:
                for line in f:
                    user = line.strip().split(',')[0]
                    classify_user(user, screen_names, user_ids)

        def missing_user(ix, column=User.screen_name):
            res = session.query(User).filter(column == ix).count() == 0
            if res:
                logger.info("Missing user %s" % ix)
            return res

        screen_names = list(filter(missing_user, screen_names))
        user_ids = list(filter(partial(missing_user, column=User.id_str), user_ids))
        nusers = []
        logger.info("Missing user ids: %s" % user_ids)
        logger.info("Missing screen names: %s" % screen_names)
        if screen_names:
            nusers = list(get_users(wq, screen_names, by_name=True))
        if user_ids:
            nusers += list(get_users(wq, user_ids, by_name=False))

        for i in nusers:
            add_user(dburi=dburi, user=i)

    total_users = session.query(sqlalchemy.func.count(User.id)).scalar()
    logger.info('Total users: {}'.format(total_users))

    def de(entry_ids):
        # parallel() delivers a tuple of entry ids per worker and expects an
        # iterable of results back, so wrap download_entry accordingly.
        for entry_id in entry_ids:
            download_entry(wq, entry_id, dburi=dburi)
            yield entry_id

    pending = pending_entries(dburi)
    session.close()

    for i in tqdm(parallel(de, pending), desc='Downloading users', total=total_users):
        logger.info("Got %s" % i)


def pending_entries(dburi):
    session = make_session(dburi)
    while True:
        # .first() returns None when there is nothing left, so guard before
        # destructuring the (User, ExtractorEntry) pair.
        res = session.query(User, ExtractorEntry).\
            filter(ExtractorEntry.user == User.id).\
            filter(ExtractorEntry.pending == True).\
            filter(ExtractorEntry.busy == False).\
            order_by(User.followers_count).first()
        if res:
            candidate, entry = res
            entry.busy = True
            session.add(entry)
            session.commit()
            yield int(entry.id)
            continue
        if session.query(ExtractorEntry).\
           filter(ExtractorEntry.busy == True).count() > 0:
            time.sleep(1)
            continue
        logger.info("No more pending entries")
        break
    session.close()


def get_tweet(c, tid):
    return c.statuses.show(id=tid)


def search_tweet(c, query):
    return c.search.tweets(q=query)


def user_timeline(c, query):
    try:
        return c.statuses.user_timeline(user_id=int(query))
    except ValueError:
        return c.statuses.user_timeline(screen_name=query)


def get_user(c, user):
    try:
        int(user)
        return c.users.lookup(user_id=user)[0]
    except ValueError:
        return c.users.lookup(screen_name=user)[0]


def download_tweet(wq, tweetid, write=True, folder="downloaded_tweets", update=False):
    cached = cached_id(tweetid, folder)
    tweet = None
    if update or not cached:
        tweet = get_tweet(wq, tweetid)
    if write:
        if tweet:
            # write_json expects the decoded object, not a JSON string
            write_json(tweet, folder)
    else:
        print(json.dumps(tweet or cached))


def cached_id(oid, folder):
    tweet = None
    file = os.path.join(folder, '%s.json' % oid)
    if os.path.exists(file) and os.path.isfile(file):
        try:
            # print('%s: Object exists' % oid)
            with open(file) as f:
                tweet = json.load(f)
        except Exception as ex:
            logger.error('Error getting cached version of {}: {}'.format(oid, ex))
    return tweet


def write_json(js, folder, oid=None):
    if not oid:
        oid = js['id']
    file = id_file(oid, folder)
    if not os.path.exists(folder):
        os.makedirs(folder)
    with open(file, 'w') as f:
        json.dump(js, f)
        logger.info('Written {} to file {}'.format(oid, file))


def id_file(oid, folder):
    return os.path.join(folder, '%s.json' % oid)


def fail_file(oid, folder):
    failsfolder = os.path.join(folder, 'failed')
    if not os.path.exists(failsfolder):
        os.makedirs(failsfolder)
    return os.path.join(failsfolder, '%s.failed' % oid)


def id_failed(oid, folder):
    return os.path.isfile(fail_file(oid, folder))


def tweet_download_batch(wq, batch):
    tweets = wq.statuses.lookup(_id=",".join(batch), map=True)['id']
    for tid, tweet in tweets.items():
        yield tid, tweet


def user_download_batch(wq, batch):
    screen_names = []
    user_ids = []
    for elem in batch:
        try:
            int(elem)
            user_ids.append(str(elem))
        except ValueError:
            screen_names.append(elem.lower())
    args = {}
    if user_ids:
        args['user_id'] = ','.join(user_ids)
    if screen_names:
        args['screen_name'] = ','.join(screen_names)
    try:
        users = wq.users.lookup(**args)
    except TwitterHTTPError as ex:
        if ex.e.code in (404,):
            users = []
        else:
            raise
    found_ids = []
    found_names = []
    for user in users:
        uid = user['id_str']
        if uid in user_ids:
            found_ids.append(uid)
            yield (uid, user)
        uname = user['screen_name'].lower()
        if uname in screen_names:
            found_names.append(uname)
            yield (uname, user)
    for uid in set(user_ids) - set(found_ids):
        yield (uid, None)
    for name in set(screen_names) - set(found_names):
        yield (name, None)
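

# Sketch of the batch contract: a `*_download_batch` function takes the API
# client and an iterable of ids, and yields (id, object-or-None) pairs, which
# is the shape download_list below consumes. The ids are hypothetical:
#
#     for uid, user in user_download_batch(wq, ['12', 'twitter']):
#         print(uid, user is not None)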


def dump_result(oid, obj, folder, ignore_fails=True):
    if obj:
        try:
            write_json(obj, folder=folder, oid=oid)
            failed = fail_file(oid, folder)
            if os.path.exists(failed):
                os.remove(failed)
        except Exception as ex:
            logger.error('%s: %s' % (oid, ex))
            if not ignore_fails:
                raise
    else:
        logger.info('Object not recovered: {}'.format(oid))
        with open(fail_file(oid, folder), 'w') as f:
            print('Object not found', file=f)


def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fails=False,
                  batch_method=tweet_download_batch):

    done = Queue()

    down = Queue()

    def filter_list(lst, done, down):
        logger.debug('Filtering list of ids')
        for oid in lst:
            # print('Checking {}'.format(oid))
            cached = cached_id(oid, folder)
            if (cached and not update):
                done.put((oid, cached))
            elif (id_failed(oid, folder) and not retry_failed):
                done.put((oid, None))
            else:
                down.put(oid)
        down.put(None)

    def download_results(batch_method, down, done):
        def gen():
            while True:
                r = down.get()
                if not r:
                    return
                yield r

        for t in parallel(batch_method, gen(), 100):
            done.put(t)

    def batch(*args, **kwargs):
        return batch_method(wq, *args, **kwargs)

    tc = threading.Thread(target=filter_list, args=(lst, done, down), daemon=True)
    tc.start()
    td = threading.Thread(target=download_results, args=(batch, down, done), daemon=True)
    td.start()

    def check_threads(ts, done):
        for t in ts:
            t.join()
        done.put(None)

    wait = threading.Thread(target=check_threads, args=([tc, td], done), daemon=True)
    wait.start()

    while True:
        rec = done.get()

        if rec is None:
            break

        oid, obj = rec
        dump_result(oid, obj, folder, ignore_fails)
        yield rec

    wait.join()
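

# Usage sketch: download a list of tweet ids into a folder, skipping anything
# already cached on disk. The ids below are hypothetical:
#
#     ids = ['20', '21']
#     for oid, obj in download_list(wq, ids, 'downloaded_tweets'):
#         print(oid, obj is not None)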


def download_file(wq, csvfile, folder, column=0, delimiter=',',
                  header=False, quotechar='"', batch_method=tweet_download_batch,
                  **kwargs):
    with open(csvfile) as f:
        csvreader = csv.reader(f, delimiter=str(delimiter), quotechar=str(quotechar))
        if header:
            next(csvreader)

        def reader(r):
            for row in r:
                if len(row) > column:
                    yield row[column].strip()

        for res in download_list(wq, reader(csvreader), folder, batch_method=batch_method,
                                 **kwargs):
            yield res


def download_timeline(wq, user):
    return wq.statuses.user_timeline(id=user)


def _consume_feed(func, feed_control=None, **kwargs):
    '''
    Get all the entries from a feed, using pagination and a given method.
    The total number of items is controlled with the `max_count` parameter,
    and the batch size of each API call with `count`.

    If max_count < 0 => Loop until the whole feed is consumed.
    If max_count == 0 => Only call the API once, with the default values.
    If max_count > 0 => Get max_count tweets from the feed.
    '''
    remaining = int(kwargs.pop('max_count', 0))
    count = int(kwargs.get('count', -1))
    limit = False

    # We need to at least perform a query, so we simulate a do-while
    # by running once with no limit and updating the condition at the end
    with tqdm(total=remaining) as pbar:
        while not limit:
            if remaining > 0 and ((count < 0) or (count > remaining)):
                kwargs['count'] = remaining
            resp, stop = feed_control(func, kwargs, remaining=remaining, batch_size=count)
            if not resp:
                return
            for entry in resp:
                yield entry
            pbar.update(len(resp))
            limit = stop
            if remaining < 0:
                # If the loop was run with a negative remaining, it will only stop
                # when the control function tells it to.
                continue
            # Otherwise, check if we have already downloaded all the required items
            remaining -= len(resp)
            limit = limit or remaining <= 0


def consume_tweets(*args, **kwargs):
    return _consume_feed(*args, feed_control=_tweets_control, **kwargs)


def consume_users(*args, **kwargs):
    return _consume_feed(*args, feed_control=_users_control, **kwargs)
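

# Usage sketch: paginate the search endpoint until 500 tweets have been
# fetched. `wq.search.tweets` is the same client attribute used by
# search_tweet above; the query is hypothetical:
#
#     tweets = list(consume_tweets(wq.search.tweets, q='python', max_count=500))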


def _tweets_control(func, apiargs, remaining=0, **kwargs):
    ''' Return a batch of tweets and whether the feed is exhausted. '''

    resp = func(**apiargs)
    if not resp:
        return None, True
    # Update the arguments for the next call.
    # Two options: either resp is a list, or a dict like:
    # {'statuses': ... 'search_metadata': ...}
    if isinstance(resp, dict) and 'search_metadata' in resp:
        resp = resp['statuses']
    max_id = min(s['id'] for s in resp) - 1
    apiargs['max_id'] = max_id
    return resp, False


def _users_control(func, apiargs, remaining=0, **kwargs):
    resp = func(**apiargs)
    stop = True
    # Update the arguments for the next call
    if 'next_cursor' in resp:
        cursor = resp['next_cursor']
        apiargs['cursor'] = cursor
        if int(cursor) != -1:
            stop = False
    return resp['users'], stop