From 53bb7edabca59919500167abe8c0cc27c3381907 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=2E=20Fernando=20S=C3=A1nchez?=
Date: Mon, 19 Mar 2018 14:35:07 +0100
Subject: [PATCH] Add sh scripts

---
 bin/README.md               | 10 ++++++++++
 bin/example_query.sh        |  1 +
 bin/extract-hashtags.sh     | 13 +++++++++++++
 bin/extract-interactions.sh | 15 +++++++++++++++
 bin/extract-limits.sh       | 16 ++++++++++++++++
 bin/extract-media.sh        | 16 ++++++++++++++++
 bin/extract-users.sh        | 28 ++++++++++++++++++++++++++++
 bin/extract.sh              | 32 ++++++++++++++++++++++++++++++++
 bin/extract_extended.sh     | 17 +++++++++++++++++
 bin/extract_text.sh         | 16 ++++++++++++++++
 bin/functions.py            | 23 +++++++++++++++++++++++
 bin/print-hashtags.sh       |  1 +
 bin/print-replies.sh        | 14 ++++++++++++++
 bin/print-rts.sh            | 15 +++++++++++++++
 docker-compose.yml          | 12 ++++++++++++
 15 files changed, 229 insertions(+)
 create mode 100644 bin/README.md
 create mode 100755 bin/example_query.sh
 create mode 100755 bin/extract-hashtags.sh
 create mode 100755 bin/extract-interactions.sh
 create mode 100755 bin/extract-limits.sh
 create mode 100755 bin/extract-media.sh
 create mode 100755 bin/extract-users.sh
 create mode 100755 bin/extract.sh
 create mode 100755 bin/extract_extended.sh
 create mode 100755 bin/extract_text.sh
 create mode 100644 bin/functions.py
 create mode 100755 bin/print-hashtags.sh
 create mode 100755 bin/print-replies.sh
 create mode 100755 bin/print-rts.sh
 create mode 100644 docker-compose.yml

diff --git a/bin/README.md b/bin/README.md
new file mode 100644
index 0000000..4928370
--- /dev/null
+++ b/bin/README.md
@@ -0,0 +1,10 @@
+Scripts to process jsonlines
+
+To get the jsonlines file, you can use the streaming API or the search API, like so:
+
+```
+python -m bitter.cli --config .bitter.yaml api '/search/tweets' --result_type recent --q 'bitter OR #bitter OR @bitter' --tweet_mode extended --tweets --max_count 5000 >> mytweets.jsonlines
+```
+
+To keep track of the query that generated the file, you can save the command in a text file.
+For instance, the example above is also in `example_query.sh`.
diff --git a/bin/example_query.sh b/bin/example_query.sh
new file mode 100755
index 0000000..df64ba0
--- /dev/null
+++ b/bin/example_query.sh
@@ -0,0 +1 @@
+python -m bitter.cli --config .bitter.yaml api '/search/tweets' --result_type recent --q 'bitter OR #bitter OR @bitter' --tweet_mode extended --tweets --max_count 5000 >> mytweets.jsonlines
diff --git a/bin/extract-hashtags.sh b/bin/extract-hashtags.sh
new file mode 100755
index 0000000..0d74435
--- /dev/null
+++ b/bin/extract-hashtags.sh
@@ -0,0 +1,13 @@
+if [ "$#" -lt 1 ]
+then
+    echo "Usage: $0 <files>"
+    exit 1
+fi
+
+export FIELDS="created_at,id,text"
+for i in "$@"
+do
+    OUTPUT=$i.hashtags.csv
+    echo "$FIELDS" > $OUTPUT
+    pv -l $i -N "hashtags $i" | jq -r '. | .created_at as $created_at | .id_str as $id | .entities.hashtags | select(. != null) | .[] | [$created_at, $id, .text] | @csv' >> $OUTPUT
+done
diff --git a/bin/extract-interactions.sh b/bin/extract-interactions.sh
new file mode 100755
index 0000000..f132778
--- /dev/null
+++ b/bin/extract-interactions.sh
@@ -0,0 +1,15 @@
+if [ "$#" -lt 1 ]
+then
+    echo "Usage: $0 <files>"
+    exit 1
+fi
+
+for i in "$@"
+do
+    REPLYOUTPUT=$i.replies.csv
+    RTOUTPUT=$i.rts.csv
+    echo 'created_at,id,user_id,reply_user_id' > $REPLYOUTPUT
+    echo 'created_at,id,user_id,rt_user_id' > $RTOUTPUT
+    pv -l -N "$i" $i | jq -r '. | select(.in_reply_to_user_id_str != null) | [.created_at, .id_str, .user.id_str, .in_reply_to_user_id_str] | @csv' >> $REPLYOUTPUT
+    pv -l -N "$i" $i | jq -r '. | select(.retweeted_status != null) | [.created_at, .retweeted_status.id_str, .user.id_str, .retweeted_status.user.id_str] | @csv' >> $RTOUTPUT
+done
diff --git a/bin/extract-limits.sh b/bin/extract-limits.sh
new file mode 100755
index 0000000..dab09bf
--- /dev/null
+++ b/bin/extract-limits.sh
@@ -0,0 +1,16 @@
+if [ "$#" -lt 1 ]
+then
+    echo "Usage: $0 <files>"
+    exit 1
+fi
+
+export QUERY='.limit | select(. != null) | [.timestamp_ms, .track] | @csv'
+
+export FIELDS="timestamp,track"
+
+for i in "$@"
+do
+    OUTPUT=$i.limits.csv
+    echo $FIELDS > $OUTPUT
+    pv -N "$i limits" -l $i | jq -r "$QUERY" >> $OUTPUT
+done
diff --git a/bin/extract-media.sh b/bin/extract-media.sh
new file mode 100755
index 0000000..baf0e7e
--- /dev/null
+++ b/bin/extract-media.sh
@@ -0,0 +1,16 @@
+if [ "$#" -lt 1 ]
+then
+    echo "Usage: $0 <files>"
+    exit 1
+fi
+
+export QUERY='select(.id != null) | .id_str as $id | .entities.urls[] | select(.expanded_url | select(. != null) | contains("open.spotify") or contains("youtube.com") or contains("youtu.be")) | [$id, .expanded_url] | @csv'
+
+export FIELDS="id,url"
+
+for i in "$@"
+do
+    OUTPUT=$i.media.csv
+    echo $FIELDS > $OUTPUT
+    pv -N "$i media" -l $i | jq -r "$QUERY" >> $OUTPUT
+done
diff --git a/bin/extract-users.sh b/bin/extract-users.sh
new file mode 100755
index 0000000..dca193e
--- /dev/null
+++ b/bin/extract-users.sh
@@ -0,0 +1,28 @@
+if [ "$#" -lt 1 ]
+then
+    echo "Usage: $0 <files>"
+    exit 1
+fi
+
+export USER_FIELDS="\$created_at,\
+.id_str,\
+.screen_name,\
+.followers_count,\
+.lang,\
+.description,\
+.statuses_count,\
+.favourites_count,\
+.friends_count,\
+.created_at,\
+.name,\
+.location,\
+.listed_count,\
+.time_zone\
+"
+
+for i in "$@"
+do
+    OUTPUT=$i.users.csv
+    echo \#$USER_FIELDS > $OUTPUT
+    jq -r ".created_at as \$created_at | .user,.retweeted_status.user | select(. != null) | [$USER_FIELDS] | @csv " $i | pv -N "$i" -l >> $OUTPUT
+done
diff --git a/bin/extract.sh b/bin/extract.sh
new file mode 100755
index 0000000..ef21cf3
--- /dev/null
+++ b/bin/extract.sh
@@ -0,0 +1,32 @@
+if [ "$#" -lt 1 ]
+then
+    echo "Usage: $0 <files>"
+    exit 1
+fi
+
+FIELDS=".id_str,\
+        .user.screen_name,\
+        .user.id,\
+        .favorite_count,\
+        .retweet_count,\
+        .quote_count,\
+        .reply_count,\
+        .created_at,\
+        .lang,\
+        .in_reply_to_user_id_str,\
+        .in_reply_to_status_id_str,\
+        .retweeted_status.id_str,\
+        .retweeted_status.user.id,\
+        .retweeted_status.favorite_count,\
+        .retweeted_status.retweet_count,\
+        .retweeted_status.quote_count,\
+        .retweeted_status.reply_count,\
+        .retweeted_status.created_at\
+"
+
+for i in "$@"
+do
+    OUTPUT=$i.tweets.csv
+    echo "$FIELDS" | sed -e 's/,[ \t\n]*\./,/g' | sed -e 's/^[#]\?\.//' > $OUTPUT
+    jq -r "[$FIELDS]|@csv" $i | pv -N "$i" -l >> $OUTPUT
+done
diff --git a/bin/extract_extended.sh b/bin/extract_extended.sh
new file mode 100755
index 0000000..5a6fcd6
--- /dev/null
+++ b/bin/extract_extended.sh
@@ -0,0 +1,17 @@
+if [ "$#" -lt 1 ]
+then
+    echo "Usage: $0 <files>"
+    exit 1
+fi
+
+QUERY='.| select(.retweeted_status != null) | .retweeted_status | .id_str as $rt_id | .extended_tweet | select(. != null) | [$rt_id,.full_text]|@csv'
+HEADER='rt_id,full_text'
+
+for i in "$@"
+do
+    OUTPUT=$i.full_text.csv
+    echo $HEADER > $OUTPUT
+    jq "$QUERY" $i | pv -N "$i" -l >> $OUTPUT
+    sort -u $OUTPUT -o $OUTPUT
+    sed -ri 's/^"([0-9]+),\\"(.*)\\""$/"\1","\2"/g' $OUTPUT
+done
diff --git a/bin/extract_text.sh b/bin/extract_text.sh
new file mode 100755
index 0000000..ffb93f3
--- /dev/null
+++ b/bin/extract_text.sh
@@ -0,0 +1,16 @@
+if [ "$#" -lt 1 ]
+then
+    echo "Usage: $0 <files>"
+    exit 1
+fi
+
+QUERY='(.full_text // .retweeted_status.full_text) as $text | [ .id_str,$text ] | @csv'
+HEADER='id,text'
+
+for i in "$@"
+do
+    OUTPUT=$i.text.csv
+    echo $HEADER > $OUTPUT
+    pv -l -N "$i" $i | jq -r "$QUERY" >> $OUTPUT
+    # sed -ri s/^"([0-9]+),\\"(.*)\\""$/"\1","\2"/g $OUTPUT
+done
diff --git a/bin/functions.py b/bin/functions.py
new file mode 100644
index 0000000..80ee4ed
--- /dev/null
+++ b/bin/functions.py
@@ -0,0 +1,23 @@
+import pandas as pd
+
+def read_rts(rtsfile, tweetsfile):
+    tweets = pd.read_csv(tweetsfile, index_col=0)
+    rts = pd.read_csv(rtsfile, index_col=1)
+    merged = rts.groupby(by=['id', 'rt_user_id']).size().rename('count').reset_index(level=1).merge(tweets, left_index=True, right_index=True)
+    return merged.sort_values(by='count', ascending=False)
+
+
+def read_tweets(tweetsfile):
+    '''When the dataset is small enough, we can load tweets as-is'''
+    with open(tweetsfile) as f:
+        header = f.readline().strip().split(',')
+    dtypes = {}
+    for key in header:
+        if key.endswith('_str') or key.endswith('.id'):
+            dtypes[key] = object
+    tweets = pd.read_csv(tweetsfile, dtype=dtypes, index_col=0)
+    return tweets
+
+
+if __name__ == '__main__':
+    import argparse
diff --git a/bin/print-hashtags.sh b/bin/print-hashtags.sh
new file mode 100755
index 0000000..cca0a53
--- /dev/null
+++ b/bin/print-hashtags.sh
@@ -0,0 +1 @@
+cat "$@" | awk -F"," '{print tolower($3)}' | sort | uniq -c | sort -h
diff --git a/bin/print-replies.sh b/bin/print-replies.sh
new file mode 100755
index 0000000..59125dd
--- /dev/null
+++ b/bin/print-replies.sh
@@ -0,0 +1,14 @@
+MAX_TAGS=100
+
+function get_text {
+    while read line
+    do
+        echo $line
+        rtid=$(echo $line | awk -F"," '{print $2}')
+        text=$(grep -m 1 $rtid *.text.csv)
+        echo "$line - $text"
+    done < "/dev/stdin"
+}
+
+cat "$@" | get_text
+
diff --git a/bin/print-rts.sh b/bin/print-rts.sh
new file mode 100755
index 0000000..1c5eb82
--- /dev/null
+++ b/bin/print-rts.sh
@@ -0,0 +1,15 @@
+MAX_TAGS=100
+
+function get_text {
+    while read line
+    do
+        echo $line
+        rtid=$(echo $line | awk '{print $2}')
+        count=$(echo $line | awk '{print $1}')
+        text=$(grep -m 1 $rtid *.text.csv)
+        echo "$line - $text"
+    done < "/dev/stdin"
+}
+
+cat "$@" | awk -F"," '{print tolower($2)}' | sort | uniq -c | sort -h | tail -n $MAX_TAGS | get_text
+
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..70c6be0
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,12 @@
+version: '2'
+services:
+  dev:
+    build:
+      context: .
+      dockerfile: Dockerfile-3.4
+    volumes:
+      - '.:/usr/src/app'
+    tty: yes
+    working_dir: '/usr/src/app'
+    entrypoint: '/bin/bash'
+    command: ''
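
For context, the sketch below shows one way the scripts added in this patch could be chained end to end. It is illustrative only: it assumes the capture file is named `mytweets.jsonlines` (as in `bin/README.md`), that commands run from the repository root with a configured `.bitter.yaml`, and that `jq` and `pv` are installed.

```
# 1. Capture tweets (same command as bin/example_query.sh; needs .bitter.yaml credentials)
bash bin/example_query.sh

# 2. Flatten the jsonlines file into CSVs written next to the input:
#    mytweets.jsonlines.tweets.csv, .replies.csv, .rts.csv, .text.csv
bash bin/extract.sh mytweets.jsonlines
bash bin/extract-interactions.sh mytweets.jsonlines
bash bin/extract_text.sh mytweets.jsonlines

# 3. print-rts.sh looks up tweet text in *.text.csv, so run it from the
#    directory that holds the extracted CSVs
bash bin/print-rts.sh mytweets.jsonlines.rts.csv
```

The `.tweets.csv` and `.rts.csv` files produced this way are the inputs that the `read_tweets` and `read_rts` helpers in `bin/functions.py` load with pandas.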
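
Along the same lines, a hashtag frequency table could be obtained by combining `extract-hashtags.sh` with `print-hashtags.sh` (same assumptions as above):

```
bash bin/extract-hashtags.sh mytweets.jsonlines
bash bin/print-hashtags.sh mytweets.jsonlines.hashtags.csv   # counts per lowercased hashtag
```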