mirror of
https://github.com/balkian/gists.git
synced 2024-11-24 10:22:28 +00:00
Rename and download script
This commit is contained in:
parent
8f4c3babce
commit
08690cd72d
43
download.py
Normal file
43
download.py
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
#!/usr/bin/env python -B
"""Import the authenticated user's public gists as git subtrees.

For every public gist that is not already present under ``repos/<gist id>``,
the script registers the gist as a git remote named after its id and adds its
``master`` branch as a subtree under that path. Metadata for the gists
processed in this run is written to ``index.json``.

The GitHub token is read from a file called ``.gist`` in the current working
directory. The script must be run from inside the target git repository.
"""

from github import Github  # pip install PyGithub
import json
import os
import shlex
import subprocess
from pathlib import Path


def run_git(*args):
    """Run ``git`` with the given arguments and return its exit status.

    Arguments are passed as a list (no shell), so ids, URLs and paths never
    need quoting. A non-zero exit status is reported but does not abort the
    script, matching the best-effort behaviour of the original ``os.system``
    calls.
    """
    cmd = ["git", *args]
    printable = " ".join(shlex.quote(part) for part in cmd)
    # Flush so our message appears before the child process' own output.
    print(f"Running {printable}", flush=True)
    status = subprocess.run(cmd).returncode
    if status != 0:
        print(f"Command exited with status {status}: {printable}", flush=True)
    return status


# NOTE(review): expanduser() is a no-op here because the path has no "~";
# the token is read from ./.gist. Confirm whether "~/.gist" was intended.
with open(os.path.expanduser(".gist")) as token_file:
    g = Github(token_file.read().strip())

repos = Path("repos")

# Metadata of the gists imported during this run (serialised to index.json).
index = []

for gist in g.get_user().get_gists():
    if not gist.public:
        print(f"Not cloning secret gist: {gist.id} - {gist.description}")
        continue

    path = repos / gist.id
    if path.exists():
        # NOTE(review): gists that already exist on disk are skipped before
        # being recorded, so a re-run rewrites index.json with only the newly
        # added gists. Confirm this is intended.
        print(f"Repository exists already: {path}")
        continue

    index.append({
        "id": gist.id,
        "description": gist.description,
        "public": gist.public,
        "clone": gist.git_pull_url,
        "updated": gist.updated_at.isoformat(),
        "url": gist.url,
    })

    # Register the gist as a remote, then pull its master branch in as a
    # subtree rooted at repos/<gist id>.
    run_git("remote", "add", gist.id, gist.git_pull_url)
    run_git("subtree", "add", "--prefix", str(path), gist.id, "master")

with open("index.json", "w") as f:
    f.write(json.dumps(index, indent=4) + "\n")
|
@ -1,2 +0,0 @@
|
|||||||
SCL - D1
|
|
||||||
SDA - D2
|
|
@ -1,13 +0,0 @@
|
|||||||
{
|
|
||||||
"embeddings": [
|
|
||||||
{
|
|
||||||
"tensorName": "300 seeds",
|
|
||||||
"tensorShape": [
|
|
||||||
1975,
|
|
||||||
100
|
|
||||||
],
|
|
||||||
"tensorPath": "https://lab.gsi.upm.es/oaraque/incel-embeddings/raw/master/300_seeds/neologisms_embeddings_2019-06-07_16-29.tsv",
|
|
||||||
"metadataPath": "https://lab.gsi.upm.es/oaraque/incel-embeddings/raw/master/300_seeds/neologisms_embeddings_words_2019-06-07_16-29.txt"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
@ -1,7 +0,0 @@
|
|||||||
# Replace define(ACROREAD, [repeat swallow(acroread) fill :
|
|
||||||
# acroread -openInNewWindow /a "$fragment" "$file"])
|
|
||||||
#
|
|
||||||
# With the following in /etc/mozpluggerrc to have evince as
|
|
||||||
# an embedded pdf viewer in chromium:
|
|
||||||
|
|
||||||
define(ACROREAD, [repeat swallow(evince) fill needs_xembed : evince -f "$file"])
|
|
@ -1 +0,0 @@
|
|||||||
ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDZM2cshLida17Ay7EZW6jB9x3jkczT3QF7XJThPmruhv3E3V50mcT6oXf7eunks1XOJtBBJfxYnB93aGt/FJ/f8n44PF6uL+cuj8L9G1tbcIJXUY4HXDN/ewLN/yJqjqax9Sf/d/RV6i3AC+lD8neUsEk7xTzDNaygv+nz40vZ52aHda5AS1hh7XsVQ8SG/2tLvOGSzLT5bhOxqEBVt1CdaBM9dPZhNG8QnMUgEajywFP8OY13q6+boPGpHe2qBKAmIvN6J2uLhcQ6tIt6ODRGyklNY6B8Cagmf0/tCNg/RfzMAWMLFl1NBn3YCszDj/BR32Nw2pSaBzKdAsIyvmfd jfernando@dit.upm.es
|
|
1
repos/Conectar Deckard/gistfile1
Normal file
1
repos/Conectar Deckard/gistfile1
Normal file
@ -0,0 +1 @@
|
|||||||
|
ssh -p 50022 root@deckard.eestec.net
|
170
repos/comparison/comparison.md
Normal file
170
repos/comparison/comparison.md
Normal file
@ -0,0 +1,170 @@
|
|||||||
|
This is a quick comparison of three different approaches to load a big CSV/TSV file into a sqlite database.
|
||||||
|
|
||||||
|
TL;DR sqlite has an `.import` function that works wonders.
|
||||||
|
|
||||||
|
# Results
|
||||||
|
## Python
|
||||||
|
|
||||||
|
```
|
||||||
|
❯ time python2 convert.py sample-twitter_rv.net
|
||||||
|
Done: 9999999 lines. 131128444 / 131128444 bytes (100.0 %)
|
||||||
|
python2 convert.py sample-twitter_rv.net 63.96s user 27.51s system 63% cpu 2:23.53 total
|
||||||
|
```
|
||||||
|
|
||||||
|
This is the output of cProfile:
|
||||||
|
```
|
||||||
|
python2 -m cProfile convert.py sample-twitter_rv.net
|
||||||
|
Done: 9999999 lines. 131128444 / 131128444 bytes (100.0 %)
|
||||||
|
50006080 function calls (50006073 primitive calls) in 241.581 seconds
|
||||||
|
|
||||||
|
Ordered by: cumulative time
|
||||||
|
|
||||||
|
ncalls tottime percall cumtime percall filename:lineno(function)
|
||||||
|
1 0.000 0.000 241.581 241.581 convert.py:1(<module>)
|
||||||
|
1 12.686 12.686 241.576 241.576 convert.py:18(main)
|
||||||
|
10000000 5.114 0.000 157.256 0.000 convert.py:7(addusers)
|
||||||
|
10000004 152.164 0.000 152.164 0.000 {method 'execute' of 'sqlite3.Connection' objects}
|
||||||
|
1001 66.972 0.067 66.972 0.067 {method 'commit' of 'sqlite3.Connection' objects}
|
||||||
|
10000002 2.471 0.000 2.471 0.000 {method 'split' of 'str' objects}
|
||||||
|
10000000 1.103 0.000 1.103 0.000 {method 'strip' of 'str' objects}
|
||||||
|
10000000 1.015 0.000 1.015 0.000 {len}
|
||||||
|
1001 0.032 0.000 0.046 0.000 convert.py:12(update_progress)
|
||||||
|
1001 0.011 0.000 0.011 0.000 {method 'format' of 'str' objects}
|
||||||
|
1000 0.005 0.000 0.005 0.000 {method 'tell' of 'file' objects}
|
||||||
|
1 0.000 0.000 0.005 0.005 __init__.py:24(<module>)
|
||||||
|
1 0.002 0.002 0.004 0.004 dbapi2.py:24(<module>)
|
||||||
|
2002 0.003 0.000 0.003 0.000 {method 'write' of 'file' objects}
|
||||||
|
1 0.001 0.001 0.002 0.002 collections.py:11(<module>)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Golang
|
||||||
|
|
||||||
|
|
||||||
|
### With Transactions
|
||||||
|
|
||||||
|
```
|
||||||
|
❯ time ./twitter-sqlite sample-twitter_rv.net
|
||||||
|
131128444/131128444 Bytes (100.000%) 10000000 lines - 0.00 Bps (avg. 1656955.51 Bps)
|
||||||
|
./sqlite-twitter sample-twitter_rv.net 71.16s user 24.15s system 120% cpu 1:19.15 total
|
||||||
|
```
|
||||||
|
|
||||||
|
Using the pprof module, I could also extract some profiling information:
|
||||||
|
|
||||||
|
```
|
||||||
|
❯ go tool pprof sqlite-twitter tx
|
||||||
|
File: sqlite-twitter
|
||||||
|
Build ID: 708e90eba7948cb0851dfbf3bb6170ccaa418eff
|
||||||
|
Type: cpu
|
||||||
|
Time: Aug 24, 2018 at 4:26pm (CEST)
|
||||||
|
Duration: 1.26mins, Total samples = 1.47mins (117.03%)
|
||||||
|
Entering interactive mode (type "help" for commands, "o" for options)
|
||||||
|
(pprof) cum
|
||||||
|
(pprof) top20
|
||||||
|
Showing nodes accounting for 65.26s, 73.82% of 88.40s total
|
||||||
|
Dropped 222 nodes (cum <= 0.44s)
|
||||||
|
Showing top 20 nodes out of 70
|
||||||
|
flat flat% sum% cum cum%
|
||||||
|
0.09s 0.1% 0.1% 58.96s 66.70% main.main
|
||||||
|
0.44s 0.5% 0.6% 53.39s 60.40% database/sql.(*Tx).StmtContext
|
||||||
|
0.11s 0.12% 0.72% 48.59s 54.97% database/sql.asString
|
||||||
|
0.06s 0.068% 0.79% 47.55s 53.79% github.com/mattn/go-sqlite3.(*SQLiteConn).Prepare
|
||||||
|
43.32s 49.00% 49.80% 44.27s 50.08% runtime.c128hash
|
||||||
|
0.02s 0.023% 49.82% 40.16s 45.43% net/url.Values.Encode
|
||||||
|
0.02s 0.023% 49.84% 39.79s 45.01% github.com/mattn/go-sqlite3.(*SQLiteConn).lastError.func3
|
||||||
|
0.24s 0.27% 50.11% 23.36s 26.43% runtime.findrunnable
|
||||||
|
0.03s 0.034% 50.15% 23.24s 26.29% runtime.casgstatus.func3
|
||||||
|
0.44s 0.5% 50.64% 16.84s 19.05% runtime.forEachP
|
||||||
|
0.17s 0.19% 50.84% 15.17s 17.16% internal/poll.runtime_pollSetDeadline
|
||||||
|
14.98s 16.95% 67.78% 14.98s 16.95% runtime.duffcopy
|
||||||
|
0.41s 0.46% 68.25% 10.67s 12.07% runtime.startm
|
||||||
|
0.17s 0.19% 68.44% 8.62s 9.75% runtime.panicdottypeI
|
||||||
|
0.11s 0.12% 68.56% 7.25s 8.20% runtime.needm
|
||||||
|
0.31s 0.35% 68.91% 6.97s 7.88% runtime.panicdottypeE
|
||||||
|
0.33s 0.37% 69.29% 6.90s 7.81% github.com/mattn/go-sqlite3.(*SQLiteDriver).Open
|
||||||
|
1.81s 2.05% 71.33% 6.38s 7.22% runtime.newm1
|
||||||
|
0.09s 0.1% 71.44% 4.75s 5.37% runtime.startlockedm
|
||||||
|
2.11s 2.39% 73.82% 4.66s 5.27% runtime.schedtrace
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
### Raw statements and fmt.Sprintf
|
||||||
|
|
||||||
|
Plain awful
|
||||||
|
|
||||||
|
### Just one transaction
|
||||||
|
|
||||||
|
```
|
||||||
|
❯ time ./twitter-sqlite sample-twitter_rv.net
|
||||||
|
./sqlite-twitter sample-twitter_rv.net 67.94s user 20.34s system 129% cpu 1:08.10 total
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
❯ go tool pprof sqlite-twitter tx
|
||||||
|
File: sqlite-twitter
|
||||||
|
Build ID: 7ec752e835de12b94418fffb45515e1b0f89e89f
|
||||||
|
Type: cpu
|
||||||
|
Time: Aug 24, 2018 at 4:57pm (CEST)
|
||||||
|
Duration: 1.25mins, Total samples = 1.46mins (117.52%)
|
||||||
|
Entering interactive mode (type "help" for commands, "o" for options)
|
||||||
|
(pprof) cum
|
||||||
|
(pprof) top20
|
||||||
|
Showing nodes accounting for 64.61s, 73.59% of 87.80s total
|
||||||
|
Dropped 207 nodes (cum <= 0.44s)
|
||||||
|
Showing top 20 nodes out of 70
|
||||||
|
flat flat% sum% cum cum%
|
||||||
|
0.08s 0.091% 0.091% 55.82s 63.58% main.updateStatus
|
||||||
|
0.49s 0.56% 0.65% 51.68s 58.86% database/sql.(*Tx).StmtContext
|
||||||
|
0.12s 0.14% 0.79% 46.56s 53.03% database/sql.asString
|
||||||
|
0.11s 0.13% 0.91% 45.52s 51.85% github.com/mattn/go-sqlite3.(*SQLiteConn).Prepare
|
||||||
|
41.12s 46.83% 47.74% 41.97s 47.80% runtime.c128hash
|
||||||
|
0.01s 0.011% 47.76% 38.64s 44.01% github.com/mattn/go-sqlite3.(*SQLiteConn).lastError.func3
|
||||||
|
0.05s 0.057% 47.81% 38.64s 44.01% net/url.Values.Encode
|
||||||
|
0.38s 0.43% 48.25% 25.40s 28.93% runtime.findrunnable
|
||||||
|
0.04s 0.046% 48.29% 25.25s 28.76% runtime.casgstatus.func3
|
||||||
|
0.51s 0.58% 48.87% 17.72s 20.18% runtime.forEachP
|
||||||
|
0.30s 0.34% 49.21% 15.93s 18.14% internal/poll.runtime_pollSetDeadline
|
||||||
|
15.61s 17.78% 66.99% 15.61s 17.78% runtime.duffcopy
|
||||||
|
0.46s 0.52% 67.52% 11.50s 13.10% runtime.startm
|
||||||
|
0.15s 0.17% 67.69% 9.12s 10.39% runtime.panicdottypeI
|
||||||
|
1.71s 1.95% 69.64% 7.39s 8.42% runtime.newm1
|
||||||
|
0.06s 0.068% 69.70% 7.34s 8.36% runtime.needm
|
||||||
|
0.36s 0.41% 70.11% 7.28s 8.29% runtime.panicdottypeE
|
||||||
|
0.45s 0.51% 70.63% 5.93s 6.75% github.com/mattn/go-sqlite3.(*SQLiteDriver).Open
|
||||||
|
2.48s 2.82% 73.45% 5.72s 6.51% runtime.schedtrace
|
||||||
|
0.12s 0.14% 73.59% 4.77s 5.43% runtime.startlockedm
|
||||||
|
```
|
||||||
|
## CLI
|
||||||
|
|
||||||
|
### Indexing first
|
||||||
|
|
||||||
|
```
|
||||||
|
❯ time sh sqlite.sh sample-twitter_rv.net
|
||||||
|
sh sqlite.sh sample-twitter_rv.net 25.18s user 6.67s system 91% cpu 34.900 total
|
||||||
|
```
|
||||||
|
|
||||||
|
### Indexing afterwards
|
||||||
|
|
||||||
|
```
|
||||||
|
❯ time sh sqlite.sh sample-twitter_rv.net
|
||||||
|
sh sqlite.sh sample-twitter_rv.net 14.91s user 1.30s system 84% cpu 19.279 total
|
||||||
|
```
|
||||||
|
|
||||||
|
# Comments
|
||||||
|
|
||||||
|
There are way too many knobs to fiddle with, and I know very little about sqlite or SQL in general.
|
||||||
|
This is a very specific use-case, and I've tried to tune the settings accordingly.
|
||||||
|
|
||||||
|
Python was the easiest one to try.
|
||||||
|
It is the language I'm most familiar with, and sqlite3 is included in the standard library, so only this file is needed.
|
||||||
|
|
||||||
|
In Go, I tried compiling with `go build` on my machine and copying the binary to a remote host.
|
||||||
|
I couldn't run it, apparently due to a mismatched glibc version or `LD_LIBRARY_PATH`.
|
||||||
|
Instead, I had to use: `go build -ldflags "-linkmode external -extldflags -static" . `.
|
||||||
|
It raises a warning, but I had no issue in my tests.
|
||||||
|
|
||||||
|
In the end, the sqlite command line was the fastest of the three, and very easy to set up.
|
||||||
|
|
||||||
|
If the file you are working with is sorted and without duplicates, the best option is to create the indexes after all the data has been loaded.
|
||||||
|
You will also have to start over if the import fails or is interrupted.
|
||||||
|
It is harder to remove duplicates afterwards in such a big dataset, and the cleanest solution is to simply copy all the unique entries to a new table and delete the old one.
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user