1
0
mirror of https://github.com/balkian/bitter.git synced 2024-12-22 08:28:12 +00:00

Added stream to CLI

* Save stream to file
* Parse file and get the most important hashtags
This commit is contained in:
J. Fernando Sánchez 2016-11-19 20:16:56 +01:00
parent 738823c8a2
commit 38605ba2c8
4 changed files with 63 additions and 5 deletions

View File

@ -1 +1 @@
0.6.0 0.6.1

View File

@ -6,6 +6,7 @@ import time
import sqlalchemy.types import sqlalchemy.types
import threading import threading
import sqlite3 import sqlite3
from tqdm import tqdm
from sqlalchemy import exists from sqlalchemy import exists
@ -333,14 +334,49 @@ def stream(ctx):
pass pass
@stream.command('get') @stream.command('get')
@click.option('-l', '--locations', default=None)
@click.option('-t', '--track', default=None)
@click.option('-f', '--file', help='File to store the stream of tweets')
@click.pass_context @click.pass_context
def get_stream(ctx): def get_stream(ctx, locations, track, file):
wq = crawlers.StreamQueue.from_credentials(bconf.CREDENTIALS, 1) wq = crawlers.StreamQueue.from_credentials(bconf.CREDENTIALS, 1)
iterator = wq.statuses.sample() query_args = {}
if locations:
query_args['locations'] = locations
if track:
query_args['track'] = track
if not query_args:
iterator = wq.statuses.sample()
else:
iterator = wq.statuses.filter(**query_args)#"-4.25,40.16,-3.40,40.75")
if not file:
file = sys.stdout
else:
file = open(file, 'a')
for tweet in tqdm(iterator):
print(json.dumps(tweet), file=file)
if file != sys.stdout:
file.close()
@stream.command('read')
@click.option('-f', '--file', help='File to read the stream of tweets from')
@click.pass_context
def read_stream(ctx, file):
for tweet in utils.read_file(file, tail=True):
print('{timestamp_ms}- @{screen_name}: {text}'.format(timestamp_ms=tweet['timestamp_ms'], screen_name=tweet['user']['screen_name'], text=tweet['text']))
@stream.command('tags')
@click.option('-f', '--file', help='File to read the stream of tweets from')
@click.argument('limit', required=False, default=None, type=int)
@click.pass_context
def tags_stream(ctx, file, limit):
c = utils.get_hashtags(utils.read_file(file))
for count, tag in c.most_common(limit):
print('{} - {}'.format(count, tag))
for tweet in iterator:
print(tweet)
if __name__ == '__main__': if __name__ == '__main__':
main() main()

View File

@ -12,6 +12,7 @@ from multiprocessing.pool import ThreadPool
from itertools import islice from itertools import islice
from contextlib import contextmanager from contextlib import contextmanager
from itertools import zip_longest from itertools import zip_longest
from collections import Counter
from twitter import TwitterHTTPError from twitter import TwitterHTTPError
@ -86,6 +87,26 @@ def add_credentials(credfile=None, **creds):
f.write('\n') f.write('\n')
def get_hashtags(iter_tweets, best=None):
c = Counter()
for tweet in iter_tweets:
c.update(tag['text'] for tag in tweet.get('entities', {}).get('hashtags', {}))
return c
def read_file(filename, tail=False):
with open(filename) as f:
while True:
line = f.readline()
if line not in (None, '', '\n'):
tweet = json.loads(line.strip())
yield tweet
else:
if tail:
time.sleep(1)
else:
return line
def get_users(wq, ulist, by_name=False, queue=None, max_users=100): def get_users(wq, ulist, by_name=False, queue=None, max_users=100):
t = 'name' if by_name else 'uid' t = 'name' if by_name else 'uid'
logger.debug('Getting users by {}: {}'.format(t, ulist)) logger.debug('Getting users by {}: {}'.format(t, ulist))

View File

@ -1,3 +1,4 @@
sqlalchemy sqlalchemy
twitter twitter
click click
tqdm