mirror of
https://github.com/balkian/bitter.git
synced 2024-12-22 00:18:12 +00:00
Add sh scripts
This commit is contained in:
parent
57eb73b53b
commit
53bb7edabc
10
bin/README.md
Normal file
10
bin/README.md
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
Scripts to process jsonlines
|
||||||
|
|
||||||
|
To get the jsonlines file, you can use the streaming API or the search API, like so:
|
||||||
|
|
||||||
|
```
|
||||||
|
python -m bitter.cli --config .bitter.yaml api '/search/tweets' --result_type recent --q 'bitter OR #bitter OR @bitter' --tweet_mode extended --tweets --max_count 5000 >> mytweets.jsonlines
|
||||||
|
```
|
||||||
|
|
||||||
|
To keep track of the query that generated the file, you can save the command in a text file.
|
||||||
|
For instance, the example above is also in `example_query.sh`.
|
1
bin/example_query.sh
Executable file
1
bin/example_query.sh
Executable file
@ -0,0 +1 @@
|
|||||||
|
# Example query: calls the '/search/tweets' endpoint through bitter.cli with the
# query 'bitter OR #bitter OR @bitter' and appends the output to mytweets.jsonlines.
python -m bitter.cli --config .bitter.yaml api '/search/tweets' --result_type recent --q 'bitter OR #bitter OR @bitter' --tweet_mode extended --tweets --max_count 5000 >> mytweets.jsonlines
|
13
bin/extract-hashtags.sh
Executable file
13
bin/extract-hashtags.sh
Executable file
@ -0,0 +1,13 @@
|
|||||||
|
#!/usr/bin/env bash
# Extract the hashtags of every tweet in one or more jsonlines files.
#
# Usage: extract-hashtags.sh <files to convert>
#
# For each input FILE, writes FILE.hashtags.csv: the header line
# "created_at,id,text" followed by one CSV row per hashtag, holding the
# tweet's created_at, the tweet's id_str and the hashtag text.
# Uses pv (progress display) and jq.
if [ "$#" -lt 1 ]
then
    echo "Usage: $0 <files to convert>" >&2
    exit 1
fi

export FIELDS="created_at,id,text"

for i in "$@"
do
    # Every expansion is quoted so that a file name containing spaces or
    # glob characters is treated as one path.
    OUTPUT="$i.hashtags.csv"
    printf '%s\n' "$FIELDS" > "$OUTPUT"
    pv -l -N "hashtags $i" "$i" \
        | jq -r '. | .created_at as $created_at | .id_str as $id | .entities.hashtags | select(. != null) | .[] | [$created_at, $id, .text] | @csv' \
        >> "$OUTPUT"
done
|
15
bin/extract-interactions.sh
Executable file
15
bin/extract-interactions.sh
Executable file
@ -0,0 +1,15 @@
|
|||||||
|
#!/usr/bin/env bash
# Extract reply and retweet interactions from jsonlines files of tweets.
#
# Usage: extract-interactions.sh <files to convert>
#
# For each input FILE two CSV files are written:
#   FILE.replies.csv  header created_at,id,user_id,reply_user_id
#                     one row per tweet whose in_reply_to_user_id_str is set
#   FILE.rts.csv      header created_at,id,user_id,rt_user_id
#                     one row per tweet that has a retweeted_status; the id
#                     column holds the id_str of the retweeted status
# Uses pv (progress display) and jq.
if [ "$#" -lt 1 ]
then
    echo "Usage: $0 <files to convert>" >&2
    exit 1
fi

for i in "$@"
do
    # Quoted so that file names with spaces or glob characters stay intact.
    REPLYOUTPUT="$i.replies.csv"
    RTOUTPUT="$i.rts.csv"
    echo 'created_at,id,user_id,reply_user_id' > "$REPLYOUTPUT"
    echo 'created_at,id,user_id,rt_user_id' > "$RTOUTPUT"
    # The input is read twice, once per output file.
    pv -l -N "$i" "$i" \
        | jq -r '. | select(.in_reply_to_user_id_str != null) | [.created_at, .id_str, .user.id_str, .in_reply_to_user_id_str] | @csv' \
        >> "$REPLYOUTPUT"
    pv -l -N "$i" "$i" \
        | jq -r '. | select(.retweeted_status != null) | [.created_at, .retweeted_status.id_str, .user.id_str, .retweeted_status.user.id_str] | @csv' \
        >> "$RTOUTPUT"
done
|
16
bin/extract-limits.sh
Executable file
16
bin/extract-limits.sh
Executable file
@ -0,0 +1,16 @@
|
|||||||
|
#!/usr/bin/env bash
# Extract the "limit" entries found in jsonlines files.
#
# Usage: extract-limits.sh <files to convert>
#
# For each input FILE, writes FILE.limits.csv: the header line
# "timestamp,track" followed by one CSV row (timestamp_ms, track) for every
# input object that has a non-null .limit member.
# Uses pv (progress display) and jq.
if [ "$#" -lt 1 ]
then
    echo "Usage: $0 <files to convert>" >&2
    exit 1
fi

export QUERY='.limit | select(. != null) | [.timestamp_ms, .track] | @csv'

export FIELDS="timestamp,track"

for i in "$@"
do
    # Quoted so that file names with spaces or glob characters stay intact.
    OUTPUT="$i.limits.csv"
    printf '%s\n' "$FIELDS" > "$OUTPUT"
    pv -N "$i limits" -l "$i" | jq -r "$QUERY" >> "$OUTPUT"
done
|
16
bin/extract-media.sh
Executable file
16
bin/extract-media.sh
Executable file
@ -0,0 +1,16 @@
|
|||||||
|
#!/usr/bin/env bash
# Extract links to Spotify and YouTube from jsonlines files of tweets.
#
# Usage: extract-media.sh <files to convert>
#
# For each input FILE, writes FILE.media.csv: the header line "id,url"
# followed by one CSV row (tweet id_str, expanded_url) for every entry of
# .entities.urls whose expanded_url contains "open.spotify", "youtube.com"
# or "youtu.be".
# Uses pv (progress display) and jq.
if [ "$#" -lt 1 ]
then
    echo "Usage: $0 <files to convert>" >&2
    exit 1
fi

export QUERY='select(.id != null) | .id_str as $id | .entities.urls[] | select(.expanded_url | select(. != null) | contains("open.spotify") or contains("youtube.com") or contains("youtu.be")) | [$id, .expanded_url] | @csv'

export FIELDS="id,url"

for i in "$@"
do
    # Quoted so that file names with spaces or glob characters stay intact.
    OUTPUT="$i.media.csv"
    printf '%s\n' "$FIELDS" > "$OUTPUT"
    pv -N "$i media" -l "$i" | jq -r "$QUERY" >> "$OUTPUT"
done
|
28
bin/extract-users.sh
Executable file
28
bin/extract-users.sh
Executable file
@ -0,0 +1,28 @@
|
|||||||
|
#!/usr/bin/env bash
# Extract user records from jsonlines files of tweets.
#
# Usage: extract-users.sh <files to convert>
#
# For each input FILE, writes FILE.users.csv. The first line is "#" followed
# by the raw jq field list below; every following line is a CSV row built
# from .user and, when present, .retweeted_status.user of each tweet. The
# first column is the created_at of the tweet the user was found in.
# Uses jq and pv (progress display).
if [ "$#" -lt 1 ]
then
    echo "Usage: $0 <files to convert>" >&2
    exit 1
fi

# jq expressions for the CSV columns. The backslash-newline pairs are removed
# by the shell, so the value is a single comma-separated line; continuation
# lines start in column 0 to keep whitespace out of it.
export USER_FIELDS="\$created_at,\
.id_str,\
.screen_name,\
.followers_count,\
.lang,\
.description,\
.statuses_count,\
.favourites_count,\
.friends_count,\
.created_at,\
.name,\
.location,\
.listed_count,\
.time_zone\
"

for i in "$@"
do
    # Quoted so that file names with spaces or glob characters stay intact.
    OUTPUT="$i.users.csv"
    printf '%s\n' "#$USER_FIELDS" > "$OUTPUT"
    jq -r ".created_at as \$created_at | .user,.retweeted_status.user | select(. != null) | [$USER_FIELDS] | @csv " "$i" \
        | pv -N "$i" -l \
        >> "$OUTPUT"
done
|
32
bin/extract.sh
Executable file
32
bin/extract.sh
Executable file
@ -0,0 +1,32 @@
|
|||||||
|
#!/usr/bin/env bash
# Flatten the main fields of each tweet in jsonlines files into CSV.
#
# Usage: extract.sh <files to convert>
#
# For each input FILE, writes FILE.tweets.csv. The header is derived from the
# jq field list below by stripping the leading "." of every field, so
# ".user.screen_name" becomes the column "user.screen_name".
# Uses jq, sed and pv (progress display).
if [ "$#" -lt 1 ]
then
    echo "Usage: $0 <files to convert>" >&2
    exit 1
fi

# jq expressions for the CSV columns. The backslash-newline pairs are removed
# by the shell, so the value is a single comma-separated line.
FIELDS=".id_str,\
.user.screen_name,\
.user.id,\
.favorite_count,\
.retweet_count,\
.quote_count,\
.reply_count,\
.created_at,\
.lang,\
.in_reply_to_user_id_str,\
.in_reply_to_status_id_str,\
.retweeted_status.id_str,\
.retweeted_status.user.id,\
.retweeted_status.favorite_count,\
.retweeted_status.retweet_count,\
.retweeted_status.quote_count,\
.retweeted_status.reply_count,\
.retweeted_status.created_at\
"

for i in "$@"
do
    # Quoted so that file names with spaces or glob characters stay intact.
    OUTPUT="$i.tweets.csv"
    # First expression drops the "." after each comma, the second drops the
    # leading "." (optionally preceded by "#") of the first field.
    printf '%s\n' "$FIELDS" \
        | sed -e 's/,[ \t\n]*\./,/g' -e 's/^[#]\?\.//' \
        > "$OUTPUT"
    jq -r "[$FIELDS]|@csv" "$i" | pv -N "$i" -l >> "$OUTPUT"
done
|
17
bin/extract_extended.sh
Executable file
17
bin/extract_extended.sh
Executable file
@ -0,0 +1,17 @@
|
|||||||
|
#!/usr/bin/env bash
# Extract the extended full text of retweeted statuses from jsonlines files.
#
# Usage: extract_extended.sh <files to convert>
#
# For each input FILE, writes FILE.full_text.csv: the header line
# "rt_id,full_text" followed by the de-duplicated records of retweeted
# statuses that carry an extended_tweet.
# Uses jq, pv (progress display), sort and sed -r.
if [ "$#" -lt 1 ]
then
    echo "Usage: $0 <files to convert>" >&2
    exit 1
fi

QUERY='.| select(.retweeted_status != null) | .retweeted_status | .id_str as $rt_id | .extended_tweet | select(. != null) | [$rt_id,.full_text]|@csv'
HEADER='rt_id,full_text'

for i in "$@"
do
    # Quoted so that file names with spaces or glob characters stay intact.
    OUTPUT="$i.full_text.csv"
    # The header is written first and only the data rows go through sort -u.
    # Sorting the finished file in place would move the header line in among
    # the records.
    printf '%s\n' "$HEADER" > "$OUTPUT"
    # jq runs without -r, so every record is printed as a JSON string on a
    # single line, which keeps the line-based sort -u safe. The sed expression
    # rewrites lines of the form "<digits>,\"<text>\"" into "<digits>","<text>".
    # NOTE(review): QUERY selects .id_str (a string), which @csv quotes, so
    # the pattern may not match the current output -- confirm against real data.
    jq "$QUERY" "$i" \
        | pv -N "$i" -l \
        | sort -u \
        | sed -r 's/^"([0-9]+),\\"(.*)\\""$/"\1","\2"/g' \
        >> "$OUTPUT"
done
|
16
bin/extract_text.sh
Executable file
16
bin/extract_text.sh
Executable file
@ -0,0 +1,16 @@
|
|||||||
|
#!/usr/bin/env bash
# Extract the text of every tweet in one or more jsonlines files.
#
# Usage: extract_text.sh <files to convert>
#
# For each input FILE, writes FILE.text.csv: the header line "id,text"
# followed by one CSV row per tweet with its id_str and its full_text,
# falling back to retweeted_status.full_text when full_text is missing.
# Uses pv (progress display) and jq.
if [ "$#" -lt 1 ]
then
    echo "Usage: $0 <files to convert>" >&2
    exit 1
fi

QUERY='(.full_text // .retweeted_status.full_text) as $text | [ .id_str,$text ] | @csv'
HEADER='id,text'

for i in "$@"
do
    # Quoted so that file names with spaces or glob characters stay intact.
    OUTPUT="$i.text.csv"
    printf '%s\n' "$HEADER" > "$OUTPUT"
    pv -l -N "$i" "$i" | jq -r "$QUERY" >> "$OUTPUT"
    # sed -ri s/^"([0-9]+),\\"(.*)\\""$/"\1","\2"/g $OUTPUT
done
|
23
bin/functions.py
Normal file
23
bin/functions.py
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
def read_rts(rtsfile, tweetsfile):
    '''Count retweets per (id, rt_user_id) pair and attach the tweet data.

    Both arguments are passed straight to ``pandas.read_csv``. The tweets
    file is indexed by its first column and the retweets file by its second
    column. The result has one row per (id, rt_user_id) pair found in both
    files, a ``count`` column with the number of retweet rows, and is sorted
    by ``count`` in descending order.
    '''
    tweets = pd.read_csv(tweetsfile, index_col=0)
    rts = pd.read_csv(rtsfile, index_col=1)

    # Number of rows for every (id, rt_user_id) combination.
    per_pair = rts.groupby(by=['id', 'rt_user_id']).size()
    # Keep 'id' as the index and turn 'rt_user_id' back into a column.
    counts = per_pair.rename('count').reset_index(level=1)

    merged = counts.merge(tweets, left_index=True, right_index=True)
    return merged.sort_values(by='count', ascending=False)
|
||||||
|
|
||||||
|
|
||||||
|
def read_tweets(tweetsfile):
    '''Load a tweets CSV file into a DataFrame indexed by its first column.

    Intended for datasets small enough to be loaded as-is. The header line
    is read first so that every column whose name ends in ``_str`` or
    ``.id`` is parsed with dtype ``object`` rather than being type-inferred.
    '''
    with open(tweetsfile) as handle:
        columns = handle.readline().strip().split(',')

    id_like = ('_str', '.id')
    dtypes = {name: object for name in columns if name.endswith(id_like)}

    return pd.read_csv(tweetsfile, dtype=dtypes, index_col=0)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
import argparse
|
1
bin/print-hashtags.sh
Executable file
1
bin/print-hashtags.sh
Executable file
@ -0,0 +1 @@
|
|||||||
|
# Print how often each value of the third comma-separated field occurs in
# the given files (lower-cased), least frequent first.
cat "$@" \
    | awk -F',' '{print tolower($3)}' \
    | sort \
    | uniq -c \
    | sort -h
|
14
bin/print-replies.sh
Executable file
14
bin/print-replies.sh
Executable file
@ -0,0 +1,14 @@
|
|||||||
|
# NOTE(review): MAX_TAGS is not referenced anywhere else in this script.
MAX_TAGS=100
|
||||||
|
|
||||||
|
#######################################
# Annotate each line read from stdin with its matching text.
# For every input line, prints the line, then prints "<line> - <match>",
# where <match> is the output of `grep -m 1` for the line's second
# comma-separated field in the *.text.csv files of the current directory
# (empty when the field is empty or nothing matches).
# Outputs: two lines on stdout per input line.
#######################################
function get_text {
    local line rtid text
    # read -r keeps backslashes literal; the extra test also processes a
    # final line that lacks a trailing newline.
    while read -r line || [ -n "$line" ]
    do
        # printf with a quoted argument avoids the globbing and whitespace
        # collapsing that an unquoted echo performs on the data.
        printf '%s\n' "$line"
        rtid=$(printf '%s\n' "$line" | awk -F"," '{print $2}')
        text=
        if [ -n "$rtid" ]
        then
            # -F: the id is a literal string, not a regular expression.
            text=$(grep -m 1 -F -- "$rtid" *.text.csv)
        fi
        printf '%s\n' "$line - $text"
    done
}
|
||||||
|
|
||||||
|
# Concatenate the files given as arguments (cat reads stdin when there are
# none) and pass every line through get_text.
cat "$@" | get_text
|
||||||
|
|
15
bin/print-rts.sh
Executable file
15
bin/print-rts.sh
Executable file
@ -0,0 +1,15 @@
|
|||||||
|
# Number of lines kept by the `tail -n` stage of the pipeline at the end of
# this script.
MAX_TAGS=100
|
||||||
|
|
||||||
|
#######################################
# Annotate each "<count> <id>" line read from stdin with its matching text.
# For every input line, prints the line, then prints "<line> - <match>",
# where <match> is the output of `grep -m 1` for the line's second
# whitespace-separated field in the *.text.csv files of the current
# directory (empty when the field is empty or nothing matches).
# Outputs: two lines on stdout per input line.
#######################################
function get_text {
    local line rtid text
    # read -r keeps backslashes literal; leading and trailing blanks are
    # still trimmed by read. The extra test also processes a final line that
    # lacks a trailing newline.
    while read -r line || [ -n "$line" ]
    do
        # printf with a quoted argument avoids the globbing and whitespace
        # collapsing that an unquoted echo performs on the data.
        printf '%s\n' "$line"
        rtid=$(printf '%s\n' "$line" | awk '{print $2}')
        text=
        if [ -n "$rtid" ]
        then
            # -F: the id is a literal string, not a regular expression.
            text=$(grep -m 1 -F -- "$rtid" *.text.csv)
        fi
        printf '%s\n' "$line - $text"
    done
}
|
||||||
|
|
||||||
|
# Count how often each value of the second comma-separated field (lower-cased)
# occurs in the given files, keep the MAX_TAGS lines with the highest counts
# and pass them through get_text.
cat "$@" | awk -F"," '{print tolower($2)}' | sort | uniq -c | sort -h | tail -n $MAX_TAGS | get_text
|
||||||
|
|
12
docker-compose.yml
Normal file
12
docker-compose.yml
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
# Compose file defining a single `dev` service built from Dockerfile-3.4.
version: '2'
services:
  dev:
    build:
      context: .
      dockerfile: Dockerfile-3.4
    volumes:
      # Mount `.` at /usr/src/app inside the container.
      - '.:/usr/src/app'
    tty: yes
    working_dir: '/usr/src/app'
    # The container starts /bin/bash with an empty command.
    entrypoint: '/bin/bash'
    command: ''
|
Loading…
Reference in New Issue
Block a user