mirror of
				https://github.com/balkian/bitter.git
				synced 2025-10-25 12:48:23 +00:00 
			
		
		
		
	Version 0.10.3
* Ability to include "optional" fields from tweets (e.g., retweeted_status). * Optional caching (for very large datasets)
This commit is contained in:
		| @@ -1 +1 @@ | ||||
| 0.10.2 | ||||
| 0.10.3 | ||||
|   | ||||
| @@ -6,10 +6,12 @@ http://github.com/balkian/bitter | ||||
| import os | ||||
|  | ||||
| from .version import __version__ | ||||
| from . import config as bconf | ||||
|  | ||||
| def easy(*args, **kwargs): | ||||
| def easy(conffile=bconf.CONFIG_FILE): | ||||
|     from .crawlers import TwitterQueue | ||||
|     return TwitterQueue.from_credentials(*args, **kwargs) | ||||
|  | ||||
|     return TwitterQueue.from_config(conffile=conffile) | ||||
|  | ||||
| __all__ = ['cli', 'config', 'crawlers', 'models', 'utils' ] | ||||
|  | ||||
|   | ||||
| @@ -8,8 +8,6 @@ import time | ||||
| import sqlalchemy.types | ||||
| import threading | ||||
| import sqlite3 | ||||
| import operator | ||||
| from functools import reduce | ||||
| from tqdm import tqdm | ||||
|  | ||||
| from sqlalchemy import exists | ||||
| @@ -19,7 +17,6 @@ from bitter import config as bconf | ||||
| from bitter.models import make_session, User, ExtractorEntry, Following | ||||
|  | ||||
| import sys | ||||
| import csv as tsv | ||||
| if sys.version_info <= (3, 0): | ||||
|     from contextlib2 import ExitStack | ||||
| else: | ||||
| @@ -29,48 +26,29 @@ else: | ||||
|  | ||||
| logger = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
|  | ||||
| def serialize(function): | ||||
|     '''Common options to serialize output to CSV or other formats''' | ||||
|  | ||||
|     @click.option('--csv', help='Print each object as a csv row. Provide a list of comma-separated fields to print.', default='', type=str) | ||||
|     @click.option('--fields', help='Provide a list of comma-separated fields to print.', default='', type=str) | ||||
|     @click.option('--ignore_missing', help='Do not show warnings for missing fields.', is_flag=True) | ||||
|     @click.option('--header', help='Header that will be printed at the beginning of the file', default=None) | ||||
|     @click.option('--csv', help='Print each object as a csv row.', is_flag=True) | ||||
|     @click.option('--jsonlines', '--json', help='Print each object as JSON in a new line.', is_flag=True) | ||||
|     @click.option('--indented', help='Print each object as an indented JSON object', is_flag=True) | ||||
|     @click.option('--outdelimiter', help='Delimiter for some output formats, such as CSV. It defaults to \t', default='\t') | ||||
|     @click.option('--outfile', help='Output file. It defaults to STDOUT', default=sys.stdout) | ||||
|     def decorated(csv, header, jsonlines, indented, outfile, **kwargs): | ||||
|         if header: | ||||
|             print(header) | ||||
|  | ||||
|     def decorated(fields, ignore_missing, header, csv, jsonlines, indented, outfile, outdelimiter, **kwargs): | ||||
|         it = function(**kwargs) | ||||
|  | ||||
|         def do(out): | ||||
|  | ||||
|         outformat = 'json' | ||||
|         if csv: | ||||
|                 delimiter = '\t' | ||||
|                 writer = tsv.writer(out, quoting=tsv.QUOTE_ALL, delimiter=delimiter) | ||||
|                 if header is None: | ||||
|                     # Print fields as header unless told otherwise | ||||
|                     print(csv.replace(',', delimiter), file=out) | ||||
|                 fields = list(token.strip().split('.') for token in csv.split(',')) | ||||
|                 for obj in it: | ||||
|                     writer.writerow(list(reduce(operator.getitem, field, obj) for field in fields)) | ||||
|             outformat = 'csv' | ||||
|         elif jsonlines: | ||||
|                 for obj in it: | ||||
|                     print(json.dumps(obj, sort_keys=True), file=out) | ||||
|             outformat = 'jsonlines' | ||||
|         elif indented: | ||||
|                 for obj in it: | ||||
|                     print(json.dumps(obj, indent=4, sort_keys=True), file=out) | ||||
|             else: | ||||
|                 for obj in it: | ||||
|                     print(obj, file=out) | ||||
|             outformat = 'indented' | ||||
|  | ||||
|         if outfile is sys.stdout: | ||||
|             return do(sys.stdout) | ||||
|         return utils.serialized(it, outfile, outformat=outformat, fields=fields.split(','), ignore_missing=ignore_missing, header=header, delimiter=outdelimiter) | ||||
|  | ||||
|         with open(outfile, 'w') as out: | ||||
|             return do(out) | ||||
|     return decorated | ||||
|  | ||||
|  | ||||
| @@ -190,13 +168,14 @@ The result is stored as individual json files in your folder of choice.''') | ||||
| @click.option('-u', '--update', is_flag=True, default=False, help='Download tweet even if it is already present. WARNING: it will overwrite existing files!') | ||||
| @click.option('-r', '--retry', is_flag=True, default=False, help='Retry failed downloads') | ||||
| @click.option('-d', '--delimiter', default=",") | ||||
| @click.option('-n', '--nocache', is_flag=True, default=False, help='Do not cache results') | ||||
| @click.option('--skip', help='Discard the first DISCARD lines (use them as a header)', default=0) | ||||
| @click.option('--commentchar', help='Lines starting with this character will be ignored', default=None) | ||||
| @click.option('-q', '--quotechar', default='"') | ||||
| @click.option('-c', '--column', type=int, default=0) | ||||
| @serialize | ||||
| @click.pass_context | ||||
| def get_tweets(ctx, tweetsfile, folder, update, retry, delimiter, skip, quotechar, commentchar, column): | ||||
| def get_tweets(ctx, tweetsfile, folder, update, retry, delimiter, nocache, skip, quotechar, commentchar, column): | ||||
|     if update and not click.confirm('This may overwrite existing tweets. Continue?'): | ||||
|         click.echo('Cancelling') | ||||
|         return | ||||
| @@ -204,8 +183,7 @@ def get_tweets(ctx, tweetsfile, folder, update, retry, delimiter, skip, quotecha | ||||
|  | ||||
|     status = tqdm('Queried') | ||||
|     failed = 0 | ||||
|     for tid, obj in utils.download_file(wq, tweetsfile, folder, delimiter=delimiter, | ||||
|                                         batch_method=utils.tweet_download_batch, | ||||
|     for tid, obj in utils.download_tweets_file(wq, tweetsfile, folder, delimiter=delimiter, cache=not nocache, | ||||
|                                                skip=skip, quotechar=quotechar, commentchar=commentchar, | ||||
|                                                column=column, update=update, retry_failed=retry): | ||||
|         status.update(1) | ||||
| @@ -264,6 +242,7 @@ def get_user(user, dry_run, folder, update): | ||||
| @click.option('-f', '--folder', default="users") | ||||
| @click.option('-u', '--update', is_flag=True, default=False, help='Download user even if it is already present. WARNING: it will overwrite existing files!') | ||||
| @click.option('-r', '--retry', is_flag=True, default=False, help='Retry failed downloads') | ||||
| @click.option('-n', '--nocache', is_flag=True, default=False, help='Do not cache results') | ||||
| @click.option('-d', '--delimiter', default=",") | ||||
| @click.option('--skip', help='Discard the first SKIP lines (e.g., use them as a header)', | ||||
|               is_flag=True, default=False) | ||||
| @@ -272,15 +251,15 @@ def get_user(user, dry_run, folder, update): | ||||
| @click.option('-c', '--column', type=int, default=0) | ||||
| @serialize | ||||
| @click.pass_context | ||||
| def get_users(ctx, usersfile, folder, update, retry, delimiter, skip, quotechar, commentchar, column): | ||||
| def get_users(ctx, usersfile, folder, update, retry, nocache, delimiter, skip, quotechar, commentchar, column): | ||||
|     if update and not click.confirm('This may overwrite existing users. Continue?'): | ||||
|         click.echo('Cancelling') | ||||
|         return | ||||
|     wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE) | ||||
|     for i in utils.download_file(wq, usersfile, folder, delimiter=delimiter, | ||||
|                                  batch_method=utils.user_download_batch, | ||||
|     for i in utils.download_users_file(wq, usersfile, folder, delimiter=delimiter, | ||||
|                                        update=update, retry_failed=retry, | ||||
|                                        skip=skip, quotechar=quotechar, | ||||
|                                        cache=not nocache, | ||||
|                                        commentchar=commentchar, | ||||
|                                        column=column): | ||||
|         yield i | ||||
| @@ -480,7 +459,6 @@ def extract(ctx, recursive, user, name, initfile): | ||||
| @extractor.command('reset') | ||||
| @click.pass_context | ||||
| def reset_extractor(ctx): | ||||
|     wq = crawlers.TwitterQueue.from_config(conffile=bconf.CONFIG_FILE) | ||||
|     db = ctx.obj['DBURI'] | ||||
|     session = make_session(db) | ||||
|     session.query(ExtractorEntry).filter(ExtractorEntry.pending==True).update({'pending':False}) | ||||
|   | ||||
| @@ -19,7 +19,9 @@ import queue | ||||
| import threading | ||||
| from select import select | ||||
|  | ||||
| from functools import partial | ||||
| import operator | ||||
|  | ||||
| from functools import partial, reduce | ||||
|  | ||||
| from tqdm import tqdm | ||||
|  | ||||
| @@ -473,22 +475,22 @@ def get_user(c, user): | ||||
|     except ValueError: | ||||
|         return c.users.lookup(screen_name=user)[0] | ||||
|  | ||||
| def download_tweet(wq, tweetid, write=True, folder="downloaded_tweets", update=False): | ||||
| def download_tweet(wq, tweetid, cache=True, folder="downloaded_tweets", update=False): | ||||
|     tweet = cached_id(tweetid, folder) | ||||
|     if update or not tweet: | ||||
|         tweet = get_tweet(wq, tweetid) | ||||
|     if write and update: | ||||
|     if cache and update: | ||||
|         if tweet: | ||||
|             js = json.dumps(tweet) | ||||
|             write_json(js, folder) | ||||
|     yield tweet | ||||
|  | ||||
|  | ||||
| def download_user(wq, userid, write=True, folder="downloaded_users", update=False): | ||||
| def download_user(wq, userid, cache=True, folder="downloaded_users", update=False): | ||||
|     user = cached_id(userid, folder) | ||||
|     if update or not user: | ||||
|         user = get_user(wq, userid) | ||||
|     if write and update: | ||||
|     if cache and update: | ||||
|         if user: | ||||
|             write_json(user, folder, aliases=[user['screen_name'], ]) | ||||
|     yield user | ||||
| @@ -589,7 +591,8 @@ def dump_result(oid, obj, folder, ignore_fails=True): | ||||
|         with open(fail_file(oid, folder), 'w') as f: | ||||
|             print('Object not found', file=f) | ||||
|  | ||||
| def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fails=False, | ||||
|  | ||||
| def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fails=False, cache=True, | ||||
|                   batch_method=tweet_download_batch): | ||||
|  | ||||
|     done = Queue() | ||||
| @@ -647,13 +650,24 @@ def download_list(wq, lst, folder, update=False, retry_failed=False, ignore_fail | ||||
|             break | ||||
|  | ||||
|         oid, obj = rec | ||||
|         if cache or (not obj): | ||||
|             dump_result(oid, obj, folder, ignore_fails) | ||||
|         yield rec | ||||
|  | ||||
|     wait.join() | ||||
|  | ||||
|  | ||||
| def download_file(wq, csvfile, folder, column=0, delimiter=',', skip=0, | ||||
def download_tweets_file(*args, **kwargs):
    '''Yield the results of `download_file` using the tweet batch downloader.

    All positional and keyword arguments are forwarded untouched, except
    `batch_method`, which is always set to `tweet_download_batch` (any value
    supplied by the caller is overridden).
    '''
    options = dict(kwargs, batch_method=tweet_download_batch)
    yield from download_file(*args, **options)
|  | ||||
|  | ||||
def download_users_file(*args, **kwargs):
    '''Yield the results of `download_file` using the user batch downloader.

    All positional and keyword arguments are forwarded untouched, except
    `batch_method`, which is always set to `user_download_batch` (any value
    supplied by the caller is overridden).
    '''
    options = dict(kwargs, batch_method=user_download_batch)
    yield from download_file(*args, **options)
|  | ||||
|  | ||||
| def download_file(wq, csvfile, folder, column=0, delimiter=',', skip=0, cache=True, | ||||
|                   quotechar='"', commentchar=None, batch_method=tweet_download_batch, | ||||
|                   **kwargs): | ||||
|     with open(csvfile) as f: | ||||
| @@ -670,7 +684,7 @@ def download_file(wq, csvfile, folder, column=0, delimiter=',', skip=0, | ||||
|                     yield row[column].strip() | ||||
|  | ||||
|  | ||||
|         for res in download_list(wq, reader(csvreader), folder, batch_method=batch_method, | ||||
|         for res in download_list(wq, reader(csvreader), folder, batch_method=batch_method, cache=cache, | ||||
|                                  **kwargs): | ||||
|             yield res | ||||
|  | ||||
| @@ -748,3 +762,42 @@ def _users_control(func, apiargs, remaining=0, **kwargs): | ||||
|         if int(cursor) != -1: | ||||
|             stop = False | ||||
|     return resp['users'], stop | ||||
|  | ||||
|  | ||||
def serialized(it, outfile, outformat='csv', fields=None, header=None, ignore_missing=False, delimiter='\t'):
    '''Write every object produced by `it` to `outfile` in the given format.

    Parameters:
      it: iterable of objects (dict-like for the csv format).
      outfile: path of the file to write, or `sys.stdout` itself.
      outformat: 'csv', 'jsonlines' or 'indented' (case-insensitive). Any
        other value prints each object as-is.
      fields: names of the columns to extract for csv output. A dotted name
        such as 'user.name' is resolved by successive item lookups. Blank
        names are ignored.
      header: text printed before the csv rows. `None` prints the field
        names joined by `delimiter`; an empty string prints no header.
      ignore_missing: when false, a warning is written to stderr for every
        field that cannot be resolved on an object.
      delimiter: column separator for csv output.

    A field that cannot be resolved (missing key, index out of range, or an
    intermediate value that is `None` or otherwise not subscriptable) is
    written as an empty csv cell instead of aborting the whole export.
    '''
    outformat = outformat.lower()
    # `fields=None` avoids a shared mutable default; blank tokens (e.g. the
    # result of ''.split(',')) would otherwise turn into bogus columns.
    names = [token.strip() for token in (fields or []) if token.strip()]

    def do(out):
        if outformat == 'csv':
            writer = csv.writer(out, quoting=csv.QUOTE_ALL, delimiter=delimiter)
            if header != '':
                h = header
                if h is None:
                    h = delimiter.join(names)
                print(h, file=out)
            attrs = [name.split('.') for name in names]
            for obj in it:
                values = []
                for attr in attrs:
                    try:
                        values.append(reduce(operator.getitem, attr, obj))
                    except (LookupError, TypeError):
                        # LookupError covers missing keys and bad indexes;
                        # TypeError is raised when an optional parent field
                        # is None (or any other non-subscriptable value).
                        if not ignore_missing:
                            print('Key not present: {}'.format(attr), file=sys.stderr)
                        values.append(None)
                writer.writerow(values)
        elif outformat == 'jsonlines':
            for obj in it:
                print(json.dumps(obj, sort_keys=True), file=out)
        elif outformat == 'indented':
            for obj in it:
                print(json.dumps(obj, indent=4, sort_keys=True), file=out)
        else:
            for obj in it:
                print(obj, file=out)

    if outfile is sys.stdout:
        return do(sys.stdout)

    # newline='' lets the csv writer control line endings itself, as the csv
    # module documentation requires for files handed to csv.writer.
    with open(outfile, 'w', newline='') as out:
        return do(out)
|   | ||||
		Reference in New Issue
	
	Block a user