From 53bb7edabca59919500167abe8c0cc27c3381907 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=2E=20Fernando=20S=C3=A1nchez?=
Date: Mon, 19 Mar 2018 14:35:07 +0100
Subject: [PATCH] Add sh scripts

---
 bin/README.md               | 10 ++++++++++
 bin/example_query.sh        |  1 +
 bin/extract-hashtags.sh     | 13 +++++++++++++
 bin/extract-interactions.sh | 15 +++++++++++++++
 bin/extract-limits.sh       | 16 ++++++++++++++++
 bin/extract-media.sh        | 16 ++++++++++++++++
 bin/extract-users.sh        | 28 ++++++++++++++++++++++++++++
 bin/extract.sh              | 32 ++++++++++++++++++++++++++++++++
 bin/extract_extended.sh     | 17 +++++++++++++++++
 bin/extract_text.sh         | 16 ++++++++++++++++
 bin/functions.py            | 23 +++++++++++++++++++++++
 bin/print-hashtags.sh       |  1 +
 bin/print-replies.sh        | 14 ++++++++++++++
 bin/print-rts.sh            | 15 +++++++++++++++
 docker-compose.yml          | 12 ++++++++++++
 15 files changed, 229 insertions(+)
 create mode 100644 bin/README.md
 create mode 100755 bin/example_query.sh
 create mode 100755 bin/extract-hashtags.sh
 create mode 100755 bin/extract-interactions.sh
 create mode 100755 bin/extract-limits.sh
 create mode 100755 bin/extract-media.sh
 create mode 100755 bin/extract-users.sh
 create mode 100755 bin/extract.sh
 create mode 100755 bin/extract_extended.sh
 create mode 100755 bin/extract_text.sh
 create mode 100644 bin/functions.py
 create mode 100755 bin/print-hashtags.sh
 create mode 100755 bin/print-replies.sh
 create mode 100755 bin/print-rts.sh
 create mode 100644 docker-compose.yml

diff --git a/bin/README.md b/bin/README.md
new file mode 100644
index 0000000..4928370
--- /dev/null
+++ b/bin/README.md
@@ -0,0 +1,10 @@
+Scripts to process jsonlines
+
+To get the jsonlines file, you can use the streaming API or the search API, like so:
+
+```
+python -m bitter.cli --config .bitter.yaml api '/search/tweets' --result_type recent --q 'bitter OR #bitter OR @bitter' --tweet_mode extended --tweets --max_count 5000 >> mytweets.jsonlines
+```
+
+To keep track of the query that generated the file, you can save the command in a text file.
+For instance, the example above is also in `example_query.sh`.
diff --git a/bin/example_query.sh b/bin/example_query.sh
new file mode 100755
index 0000000..df64ba0
--- /dev/null
+++ b/bin/example_query.sh
@@ -0,0 +1 @@
+python -m bitter.cli --config .bitter.yaml api '/search/tweets' --result_type recent --q 'bitter OR #bitter OR @bitter' --tweet_mode extended --tweets --max_count 5000 >> mytweets.jsonlines
diff --git a/bin/extract-hashtags.sh b/bin/extract-hashtags.sh
new file mode 100755
index 0000000..0d74435
--- /dev/null
+++ b/bin/extract-hashtags.sh
@@ -0,0 +1,13 @@
+if [ "$#" -lt 1 ]
+then
+    echo "Usage: $0 <files>"
+    exit 1
+fi
+
+export FIELDS="created_at,id,text"
+for i in "$@"
+do
+    OUTPUT=$i.hashtags.csv
+    echo "$FIELDS" > $OUTPUT
+    pv -l $i -N "hashtags $i" | jq -r '. | .created_at as $created_at | .id_str as $id | .entities.hashtags | select(. != null) | .[] | [$created_at, $id, .text] | @csv' >> $OUTPUT
+done
diff --git a/bin/extract-interactions.sh b/bin/extract-interactions.sh
new file mode 100755
index 0000000..f132778
--- /dev/null
+++ b/bin/extract-interactions.sh
@@ -0,0 +1,15 @@
+if [ "$#" -lt 1 ]
+then
+    echo "Usage: $0 <files>"
+    exit 1
+fi
+
+for i in "$@"
+do
+    REPLYOUTPUT=$i.replies.csv
+    RTOUTPUT=$i.rts.csv
+    echo 'created_at,id,user_id,reply_user_id' > $REPLYOUTPUT
+    echo 'created_at,id,user_id,rt_user_id' > $RTOUTPUT
+    pv -l -N "$i" $i | jq -r '. | select(.in_reply_to_user_id_str != null) | [.created_at, .id_str, .user.id_str, .in_reply_to_user_id_str] | @csv' >> $REPLYOUTPUT
+    pv -l -N "$i" $i | jq -r '. | select(.retweeted_status != null) | [.created_at, .retweeted_status.id_str, .user.id_str, .retweeted_status.user.id_str] | @csv' >> $RTOUTPUT
+done
diff --git a/bin/extract-limits.sh b/bin/extract-limits.sh
new file mode 100755
index 0000000..dab09bf
--- /dev/null
+++ b/bin/extract-limits.sh
@@ -0,0 +1,16 @@
+if [ "$#" -lt 1 ]
+then
+    echo "Usage: $0 <files>"
+    exit 1
+fi
+
+export QUERY='.limit | select(. != null) | [.timestamp_ms, .track] | @csv'
+
+export FIELDS="timestamp,track"
+
+for i in "$@"
+do
+    OUTPUT=$i.limits.csv
+    echo $FIELDS > $OUTPUT
+    pv -N "$i limits" -l $i | jq -r "$QUERY" >> $OUTPUT
+done
diff --git a/bin/extract-media.sh b/bin/extract-media.sh
new file mode 100755
index 0000000..baf0e7e
--- /dev/null
+++ b/bin/extract-media.sh
@@ -0,0 +1,16 @@
+if [ "$#" -lt 1 ]
+then
+    echo "Usage: $0 <files>"
+    exit 1
+fi
+
+export QUERY='select(.id != null) | .id_str as $id | .entities.urls[] | select(.expanded_url | select(. != null) | contains("open.spotify") or contains("youtube.com") or contains("youtu.be")) | [$id, .expanded_url] | @csv'
+
+export FIELDS="id,url"
+
+for i in "$@"
+do
+    OUTPUT=$i.media.csv
+    echo $FIELDS > $OUTPUT
+    pv -N "$i media" -l $i | jq -r "$QUERY" >> $OUTPUT
+done
diff --git a/bin/extract-users.sh b/bin/extract-users.sh
new file mode 100755
index 0000000..dca193e
--- /dev/null
+++ b/bin/extract-users.sh
@@ -0,0 +1,28 @@
+if [ "$#" -lt 1 ]
+then
+    echo "Usage: $0 <files>"
+    exit 1
+fi
+
+export USER_FIELDS="\$created_at,\
+.id_str,\
+.screen_name,\
+.followers_count,\
+.lang,\
+.description,\
+.statuses_count,\
+.favourites_count,\
+.friends_count,\
+.created_at,\
+.name,\
+.location,\
+.listed_count,\
+.time_zone\
+"
+
+for i in "$@"
+do
+    OUTPUT=$i.users.csv
+    echo \#$USER_FIELDS > $OUTPUT
+    jq -r ".created_at as \$created_at | .user,.retweeted_status.user | select(. != null) | [$USER_FIELDS] | @csv " $i | pv -N "$i" -l >> $OUTPUT
+done
diff --git a/bin/extract.sh b/bin/extract.sh
new file mode 100755
index 0000000..ef21cf3
--- /dev/null
+++ b/bin/extract.sh
@@ -0,0 +1,32 @@
+if [ "$#" -lt 1 ]
+then
+    echo "Usage: $0 <files>"
+    exit 1
+fi
+
+FIELDS=".id_str,\
+        .user.screen_name,\
+        .user.id,\
+        .favorite_count,\
+        .retweet_count,\
+        .quote_count,\
+        .reply_count,\
+        .created_at,\
+        .lang,\
+        .in_reply_to_user_id_str,\
+        .in_reply_to_status_id_str,\
+        .retweeted_status.id_str,\
+        .retweeted_status.user.id,\
+        .retweeted_status.favorite_count,\
+        .retweeted_status.retweet_count,\
+        .retweeted_status.quote_count,\
+        .retweeted_status.reply_count,\
+        .retweeted_status.created_at\
+"
+
+for i in "$@"
+do
+    OUTPUT=$i.tweets.csv
+    echo "$FIELDS" | sed -e 's/,[ \t\n]*\./,/g' | sed -e 's/^[#]\?\.//' > $OUTPUT
+    jq -r "[$FIELDS]|@csv" $i | pv -N "$i" -l >> $OUTPUT
+done
diff --git a/bin/extract_extended.sh b/bin/extract_extended.sh
new file mode 100755
index 0000000..5a6fcd6
--- /dev/null
+++ b/bin/extract_extended.sh
@@ -0,0 +1,17 @@
+if [ "$#" -lt 1 ]
+then
+    echo "Usage: $0 <files>"
+    exit 1
+fi
+
+QUERY='.| select(.retweeted_status != null) | .retweeted_status | .id_str as $rt_id | .extended_tweet | select(. != null) | [$rt_id,.full_text]|@csv'
+HEADER='rt_id,full_text'
+
+for i in "$@"
+do
+    OUTPUT=$i.full_text.csv
+    echo $HEADER > $OUTPUT
+    jq "$QUERY" $i | pv -N "$i" -l >> $OUTPUT
+    sort -u $OUTPUT -o $OUTPUT
+    sed -ri 's/^"([0-9]+),\\"(.*)\\""$/"\1","\2"/g' $OUTPUT
+done
diff --git a/bin/extract_text.sh b/bin/extract_text.sh
new file mode 100755
index 0000000..ffb93f3
--- /dev/null
+++ b/bin/extract_text.sh
@@ -0,0 +1,16 @@
+if [ "$#" -lt 1 ]
+then
+    echo "Usage: $0 <files>"
+    exit 1
+fi
+
+QUERY='(.full_text // .retweeted_status.full_text) as $text | [ .id_str,$text ] | @csv'
+HEADER='id,text'
+
+for i in "$@"
+do
+    OUTPUT=$i.text.csv
+    echo $HEADER > $OUTPUT
+    pv -l -N "$i" $i | jq -r "$QUERY" >> $OUTPUT
+    # sed -ri s/^"([0-9]+),\\"(.*)\\""$/"\1","\2"/g $OUTPUT
+done
diff --git a/bin/functions.py b/bin/functions.py
new file mode 100644
index 0000000..80ee4ed
--- /dev/null
+++ b/bin/functions.py
@@ -0,0 +1,23 @@
+import pandas as pd
+
+def read_rts(rtsfile, tweetsfile):
+    tweets = pd.read_csv(tweetsfile, index_col=0)
+    rts = pd.read_csv(rtsfile, index_col=1)
+    merged = rts.groupby(by=['id', 'rt_user_id']).size().rename('count').reset_index(level=1).merge(tweets, left_index=True, right_index=True)
+    return merged.sort_values(by='count', ascending=False)
+
+
+def read_tweets(tweetsfile):
+    '''When the dataset is small enough, we can load tweets as-is'''
+    with open(tweetsfile) as f:
+        header = f.readline().strip().split(',')
+    dtypes = {}
+    for key in header:
+        if key.endswith('_str') or key.endswith('.id'):
+            dtypes[key] = object
+    tweets = pd.read_csv(tweetsfile, dtype=dtypes, index_col=0)
+    return tweets
+
+
+if __name__ == '__main__':
+    import argparse
diff --git a/bin/print-hashtags.sh b/bin/print-hashtags.sh
new file mode 100755
index 0000000..cca0a53
--- /dev/null
+++ b/bin/print-hashtags.sh
@@ -0,0 +1 @@
+cat "$@" | awk -F"," '{print tolower($3)}' | sort | uniq -c | sort -h
diff --git a/bin/print-replies.sh b/bin/print-replies.sh
new file mode 100755
index 0000000..59125dd
--- /dev/null
+++ b/bin/print-replies.sh
@@ -0,0 +1,14 @@
+MAX_TAGS=100
+
+function get_text {
+    while read line
+    do
+        echo $line
+        rtid=$(echo $line | awk -F"," '{print $2}')
+        text=$(grep -m 1 $rtid *.text.csv)
+        echo "$line - $text"
+    done < "/dev/stdin"
+}
+
+cat "$@" | get_text
+
diff --git a/bin/print-rts.sh b/bin/print-rts.sh
new file mode 100755
index 0000000..1c5eb82
--- /dev/null
+++ b/bin/print-rts.sh
@@ -0,0 +1,15 @@
+MAX_TAGS=100
+
+function get_text {
+    while read line
+    do
+        echo $line
+        rtid=$(echo $line | awk '{print $2}')
+        count=$(echo $line | awk '{print $1}')
+        text=$(grep -m 1 $rtid *.text.csv)
+        echo "$line - $text"
+    done < "/dev/stdin"
+}
+
+cat "$@" | awk -F"," '{print tolower($2)}' | sort | uniq -c | sort -h | tail -n $MAX_TAGS | get_text
+
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..70c6be0
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,12 @@
+version: '2'
+services:
+  dev:
+    build:
+      context: .
+      dockerfile: Dockerfile-3.4
+    volumes:
+      - '.:/usr/src/app'
+    tty: yes
+    working_dir: '/usr/src/app'
+    entrypoint: '/bin/bash'
+    command: ''
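
For context, the sketch below shows one way the scripts added in this patch could be chained end to end. It is illustrative only: it assumes the capture file is named `mytweets.jsonlines` (as in `bin/README.md`), that commands run from the repository root with a configured `.bitter.yaml`, and that `jq` and `pv` are installed.

```
# 1. Capture tweets (same command as bin/example_query.sh; needs .bitter.yaml credentials)
bash bin/example_query.sh

# 2. Flatten the jsonlines file into CSVs written next to the input:
#    mytweets.jsonlines.tweets.csv, .replies.csv, .rts.csv, .text.csv
bash bin/extract.sh mytweets.jsonlines
bash bin/extract-interactions.sh mytweets.jsonlines
bash bin/extract_text.sh mytweets.jsonlines

# 3. print-rts.sh looks up tweet text in *.text.csv, so run it from the
#    directory that holds the extracted CSVs
bash bin/print-rts.sh mytweets.jsonlines.rts.csv
```

The `.tweets.csv` and `.rts.csv` files produced this way are the inputs that the `read_tweets` and `read_rts` helpers in `bin/functions.py` load with pandas.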
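
Along the same lines, a hashtag frequency table could be obtained by combining `extract-hashtags.sh` with `print-hashtags.sh` (same assumptions as above):

```
bash bin/extract-hashtags.sh mytweets.jsonlines
bash bin/print-hashtags.sh mytweets.jsonlines.hashtags.csv   # counts per lowercased hashtag
```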