mirror of
https://github.com/balkian/bitter.git
synced 2024-12-22 00:18:12 +00:00
Add sh scripts
This commit is contained in:
parent
57eb73b53b
commit
53bb7edabc
10
bin/README.md
Normal file
10
bin/README.md
Normal file
@ -0,0 +1,10 @@
|
||||
Scripts to process jsonlines
|
||||
|
||||
To get the jsonlines file, you can use the streaming API or the search API, like so:
|
||||
|
||||
```
|
||||
python -m bitter.cli --config .bitter.yaml api '/search/tweets' --result_type recent --q 'bitter OR #bitter OR @bitter' --tweet_mode extended --tweets --max_count 5000 >> mytweets.jsonlines
|
||||
```
|
||||
|
||||
To keep track of the query that generated the file, you can save the command in a text file.
|
||||
For instance, the example above is also in `example_query.sh`.
|
1
bin/example_query.sh
Executable file
1
bin/example_query.sh
Executable file
@ -0,0 +1 @@
|
||||
#!/bin/bash
# Example search-API query: fetch up to 5000 recent tweets mentioning
# "bitter" (plain, hashtag or mention) and append them, one JSON object
# per line, to mytweets.jsonlines.  Matches the example in bin/README.md.
python -m bitter.cli --config .bitter.yaml api '/search/tweets' --result_type recent --q 'bitter OR #bitter OR @bitter' --tweet_mode extended --tweets --max_count 5000 >> mytweets.jsonlines
|
13
bin/extract-hashtags.sh
Executable file
13
bin/extract-hashtags.sh
Executable file
@ -0,0 +1,13 @@
|
||||
#!/bin/bash
# Extract hashtag occurrences from tweet jsonlines files.
#
# Usage: extract-hashtags.sh <files to convert>
# For each input FILE, writes FILE.hashtags.csv with one row per hashtag
# occurrence (columns: created_at,id,text).  Requires pv and jq.

if [ "$#" -lt 1 ]
then
  # Diagnostics belong on stderr so they do not pollute piped output.
  echo "Usage: $0 <files to convert>" >&2
  exit 1
fi

export FIELDS="created_at,id,text"

for i in "$@"
do
  # Quote every expansion: input filenames may contain spaces/globs.
  OUTPUT="$i.hashtags.csv"
  echo "$FIELDS" > "$OUTPUT"
  # pv -l reports line-based progress; jq emits one CSV row per hashtag.
  pv -l "$i" -N "hashtags $i" | jq -r '. | .created_at as $created_at | .id_str as $id | .entities.hashtags | select(. != null) | .[] | [$created_at, $id, .text] | @csv' >> "$OUTPUT"
done
|
15
bin/extract-interactions.sh
Executable file
15
bin/extract-interactions.sh
Executable file
@ -0,0 +1,15 @@
|
||||
#!/bin/bash
# Extract reply and retweet interactions from tweet jsonlines files.
#
# Usage: extract-interactions.sh <files to convert>
# For each input FILE, writes:
#   FILE.replies.csv  (created_at,id,user_id,reply_user_id)
#   FILE.rts.csv      (created_at,id,user_id,rt_user_id)
# Requires pv and jq.

if [ "$#" -lt 1 ]
then
  echo "Usage: $0 <files to convert>" >&2
  exit 1
fi

for i in "$@"
do
  # Quote every expansion: input filenames may contain spaces/globs.
  REPLYOUTPUT="$i.replies.csv"
  RTOUTPUT="$i.rts.csv"
  echo 'created_at,id,user_id,reply_user_id' > "$REPLYOUTPUT"
  echo 'created_at,id,user_id,rt_user_id' > "$RTOUTPUT"
  # One pass per output: replies first, then retweets.
  pv -l -N "$i" "$i" | jq -r '. | select(.in_reply_to_user_id_str != null) | [.created_at, .id_str, .user.id_str, .in_reply_to_user_id_str] | @csv' >> "$REPLYOUTPUT"
  pv -l -N "$i" "$i" | jq -r '. | select(.retweeted_status != null) | [.created_at, .retweeted_status.id_str, .user.id_str, .retweeted_status.user.id_str] | @csv' >> "$RTOUTPUT"
done
|
16
bin/extract-limits.sh
Executable file
16
bin/extract-limits.sh
Executable file
@ -0,0 +1,16 @@
|
||||
#!/bin/bash
# Extract streaming-API "limit" notices (tweets withheld due to rate
# limiting) from jsonlines files.
#
# Usage: extract-limits.sh <files to convert>
# For each input FILE, writes FILE.limits.csv (timestamp,track).
# Requires pv and jq.

if [ "$#" -lt 1 ]
then
  echo "Usage: $0 <files to convert>" >&2
  exit 1
fi

export QUERY='.limit | select(. != null) | [.timestamp_ms, .track] | @csv'

export FIELDS="timestamp,track"

for i in "$@"
do
  # Quote every expansion: input filenames may contain spaces/globs.
  OUTPUT="$i.limits.csv"
  echo "$FIELDS" > "$OUTPUT"
  pv -N "$i limits" -l "$i" | jq -r "$QUERY" >> "$OUTPUT"
done
|
16
bin/extract-media.sh
Executable file
16
bin/extract-media.sh
Executable file
@ -0,0 +1,16 @@
|
||||
#!/bin/bash
# Extract media links (Spotify / YouTube URLs) from tweet jsonlines files.
#
# Usage: extract-media.sh <files to convert>
# For each input FILE, writes FILE.media.csv (id,url), one row per
# matching expanded URL.  Requires pv and jq.

if [ "$#" -lt 1 ]
then
  echo "Usage: $0 <files to convert>" >&2
  exit 1
fi

export QUERY='select(.id != null) | .id_str as $id | .entities.urls[] | select(.expanded_url | select(. != null) | contains("open.spotify") or contains("youtube.com") or contains("youtu.be")) | [$id, .expanded_url] | @csv'

export FIELDS="id,url"

for i in "$@"
do
  # Quote every expansion: input filenames may contain spaces/globs.
  OUTPUT="$i.media.csv"
  echo "$FIELDS" > "$OUTPUT"
  pv -N "$i media" -l "$i" | jq -r "$QUERY" >> "$OUTPUT"
done
|
28
bin/extract-users.sh
Executable file
28
bin/extract-users.sh
Executable file
@ -0,0 +1,28 @@
|
||||
#!/bin/bash
# Extract user profiles (from both .user and .retweeted_status.user) out
# of tweet jsonlines files.
#
# Usage: extract-users.sh <files to convert>
# For each input FILE, writes FILE.users.csv; the header row is the field
# list prefixed with '#'.  Requires pv and jq.

if [ "$#" -lt 1 ]
then
  echo "Usage: $0 <files to convert>" >&2
  exit 1
fi

# The leading \$created_at stays literal here so it is interpolated into
# the jq program below as the jq variable $created_at (the tweet's
# timestamp, captured before descending into the user object).
export USER_FIELDS="\$created_at,\
.id_str,\
.screen_name,\
.followers_count,\
.lang,\
.description,\
.statuses_count,\
.favourites_count,\
.friends_count,\
.created_at,\
.name,\
.location,\
.listed_count,\
.time_zone\
"

for i in "$@"
do
  # Quote every expansion: input filenames may contain spaces/globs.
  OUTPUT="$i.users.csv"
  echo "#$USER_FIELDS" > "$OUTPUT"
  # Emit one CSV row for the author and (when present) the retweeted
  # author of every tweet; pv on the output side shows progress.
  jq -r ".created_at as \$created_at | .user,.retweeted_status.user | select(. != null) | [$USER_FIELDS] | @csv " "$i" | pv -N "$i" -l >> "$OUTPUT"
done
|
32
bin/extract.sh
Executable file
32
bin/extract.sh
Executable file
@ -0,0 +1,32 @@
|
||||
#!/bin/bash
# Extract a flat per-tweet CSV (ids, counts, language, reply/RT links)
# from tweet jsonlines files.
#
# Usage: extract.sh <files to convert>
# For each input FILE, writes FILE.tweets.csv.  The header row is derived
# from FIELDS by stripping the leading dots of each jq path.
# Requires pv and jq.

if [ "$#" -lt 1 ]
then
  echo "Usage: $0 <files to convert>" >&2
  exit 1
fi

# jq paths; the backslash-newlines collapse this into a single line.
FIELDS=".id_str,\
.user.screen_name,\
.user.id,\
.favorite_count,\
.retweet_count,\
.quote_count,\
.reply_count,\
.created_at,\
.lang,\
.in_reply_to_user_id_str,\
.in_reply_to_status_id_str,\
.retweeted_status.id_str,\
.retweeted_status.user.id,\
.retweeted_status.favorite_count,\
.retweeted_status.retweet_count,\
.retweeted_status.quote_count,\
.retweeted_status.reply_count,\
.retweeted_status.created_at\
"

for i in "$@"
do
  # Quote every expansion: input filenames may contain spaces/globs.
  OUTPUT="$i.tweets.csv"
  # Turn the jq field list into CSV column names (drop the leading '.'
  # of each path) for the header row.
  echo "$FIELDS" | sed -e 's/,[ \t\n]*\./,/g' | sed -e 's/^[#]\?\.//' > "$OUTPUT"
  jq -r "[$FIELDS]|@csv" "$i" | pv -N "$i" -l >> "$OUTPUT"
done
|
17
bin/extract_extended.sh
Executable file
17
bin/extract_extended.sh
Executable file
@ -0,0 +1,17 @@
|
||||
#!/bin/bash
# Extract the extended (untruncated) full_text of retweeted tweets.
#
# Usage: extract_extended.sh <files to convert>
# For each input FILE, writes FILE.full_text.csv (rt_id,full_text),
# deduplicated.  Requires pv and jq; the sed -r/-i flags are GNU sed.

if [ "$#" -lt 1 ]
then
  echo "Usage: $0 <files to convert>" >&2
  exit 1
fi

QUERY='.| select(.retweeted_status != null) | .retweeted_status | .id_str as $rt_id | .extended_tweet | select(. != null) | [$rt_id,.full_text]|@csv'
HEADER='rt_id,full_text'

for i in "$@"
do
  # Quote every expansion: input filenames may contain spaces/globs.
  OUTPUT="$i.full_text.csv"
  echo "$HEADER" > "$OUTPUT"
  # NOTE: jq is deliberately run WITHOUT -r here: the sed rewrite below
  # expects the JSON-quoted form ("id,\"text\"") and turns it into two
  # properly quoted CSV fields.  Do not add -r without changing the sed.
  jq "$QUERY" "$i" | pv -N "$i" -l >> "$OUTPUT"
  sort -u "$OUTPUT" -o "$OUTPUT"
  sed -ri 's/^"([0-9]+),\\"(.*)\\""$/"\1","\2"/g' "$OUTPUT"
done
|
16
bin/extract_text.sh
Executable file
16
bin/extract_text.sh
Executable file
@ -0,0 +1,16 @@
|
||||
#!/bin/bash
# Extract tweet id and text (falling back to the retweeted tweet's text)
# from tweet jsonlines files.
#
# Usage: extract_text.sh <files to convert>
# For each input FILE, writes FILE.text.csv (id,text).
# Requires pv and jq.

if [ "$#" -lt 1 ]
then
  echo "Usage: $0 <files to convert>" >&2
  exit 1
fi

# jq's // operator: use .full_text when present, else the retweet's text.
QUERY='(.full_text // .retweeted_status.full_text) as $text | [ .id_str,$text ] | @csv'
HEADER='id,text'

for i in "$@"
do
  # Quote every expansion: input filenames may contain spaces/globs.
  OUTPUT="$i.text.csv"
  echo "$HEADER" > "$OUTPUT"
  pv -l -N "$i" "$i" | jq -r "$QUERY" >> "$OUTPUT"
  # sed -ri s/^"([0-9]+),\\"(.*)\\""$/"\1","\2"/g $OUTPUT
done
|
23
bin/functions.py
Normal file
23
bin/functions.py
Normal file
@ -0,0 +1,23 @@
|
||||
import pandas as pd
|
||||
|
||||
def read_rts(rtsfile, tweetsfile):
    # Join per-retweeter counts onto the tweets table.
    # rtsfile: CSV with columns created_at,id,user_id,rt_user_id (the
    #   output of extract-interactions.sh); indexed here by the retweeted
    #   tweet id (column 1).
    # tweetsfile: CSV indexed by its first column — presumably the tweet
    #   id, so the index-on-index merge below lines up; TODO confirm.
    tweets = pd.read_csv(tweetsfile, index_col=0)
    rts = pd.read_csv(rtsfile, index_col=1)
    # Count occurrences of each (id, rt_user_id) pair, keep rt_user_id as
    # a column (reset_index(level=1)), then attach the matching tweet row
    # by index.  Most-retweeted pairs come first.
    merged = rts.groupby(by=['id', 'rt_user_id']).size().rename('count').reset_index(level=1).merge(tweets, left_index=True, right_index=True)
    return merged.sort_values(by='count', ascending=False)
|
||||
|
||||
|
||||
def read_tweets(tweetsfile):
    '''When the dataset is small enough, we can load tweets as-is.

    Reads the CSV header first and forces ``object`` dtype on id-like
    columns (names ending in ``_str`` or ``.id``) — presumably so pandas
    keeps large tweet/user ids as strings instead of lossy floats; the
    first column becomes the index.
    '''
    with open(tweetsfile) as f:
        header = f.readline().strip().split(',')
    dtypes = {}
    for key in header:
        if key.endswith('_str') or key.endswith('.id'):
            dtypes[key] = object
    tweets = pd.read_csv(tweetsfile, dtype=dtypes, index_col=0)
    return tweets
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
import argparse
|
1
bin/print-hashtags.sh
Executable file
1
bin/print-hashtags.sh
Executable file
@ -0,0 +1 @@
|
||||
#!/bin/bash
# Print hashtag frequency counts (ascending) from hashtag CSVs produced
# by extract-hashtags.sh, read from the given files or stdin.
# Column 3 is the hashtag text; it is lowercased before counting.
# cat "$@" is kept deliberately: it handles the no-argument/stdin case
# and concatenates multiple files uniformly for the pipeline.
cat "$@" | awk -F"," '{print tolower($3)}' | sort | uniq -c | sort -h
|
14
bin/print-replies.sh
Executable file
14
bin/print-replies.sh
Executable file
@ -0,0 +1,14 @@
|
||||
#!/bin/bash
# Annotate reply CSV rows (created_at,id,user_id,reply_user_id — see
# extract-interactions.sh) with the text of the replied-to tweet, looked
# up in the *.text.csv files of the current directory.
# Reads from the given files or stdin.

# For each stdin row: echo it, then echo it again suffixed with the first
# *.text.csv line matching the id in field 2.
get_text () {
  while read -r line
  do
    echo "$line"
    rtid=$(echo "$line" | awk -F"," '{print $2}')
    text=$(grep -m 1 -- "$rtid" *.text.csv)
    echo "$line - $text"
  done < "/dev/stdin"
}

cat "$@" | get_text
|
||||
|
15
bin/print-rts.sh
Executable file
15
bin/print-rts.sh
Executable file
@ -0,0 +1,15 @@
|
||||
#!/bin/bash
# Print the MAX_TAGS most-retweeted tweet ids (with counts) from retweet
# CSVs produced by extract-interactions.sh, annotated with the tweet text
# looked up in the *.text.csv files of the current directory.
MAX_TAGS=100

# For each "count id" row on stdin: echo it, then echo it again suffixed
# with the first *.text.csv line matching the id (field 2).
get_text () {
  while read -r line
  do
    echo "$line"
    rtid=$(echo "$line" | awk '{print $2}')
    text=$(grep -m 1 -- "$rtid" *.text.csv)
    echo "$line - $text"
  done < "/dev/stdin"
}

# Lowercase the retweeted id (column 2), count occurrences, and keep the
# MAX_TAGS most frequent before resolving their text.
cat "$@" | awk -F"," '{print tolower($2)}' | sort | uniq -c | sort -h | tail -n "$MAX_TAGS" | get_text
|
||||
|
12
docker-compose.yml
Normal file
12
docker-compose.yml
Normal file
@ -0,0 +1,12 @@
|
||||
# Development container for working on the repo: builds from the local
# Dockerfile-3.4, mounts the checkout at /usr/src/app and drops you into
# an interactive bash shell (run with `docker-compose run dev`).
version: '2'
services:
  dev:
    build:
      context: .
      dockerfile: Dockerfile-3.4
    volumes:
      # Live-mount the source tree so edits on the host are visible.
      - '.:/usr/src/app'
    tty: yes
    working_dir: '/usr/src/app'
    # Interactive shell instead of a service process.
    entrypoint: '/bin/bash'
    command: ''
|
Loading…
Reference in New Issue
Block a user