diff --git a/download.py b/download.py
new file mode 100644
index 0000000..914de1a
--- /dev/null
+++ b/download.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python -B
+"""Mirror the authenticated user's public gists into this repository.
+
+Every public gist that is not already present under ``repos/<gist id>`` is
+registered as a git remote and imported with ``git subtree add``.  Metadata
+for the gists imported by this run is written to ``index.json``.
+
+The GitHub API token is read from the ``.gist`` file; that file and every
+other path used here are relative to the current working directory.
+"""
+
+import json
+import shlex
+import subprocess
+from pathlib import Path
+
+from github import Github  # pip install PyGithub
+
+TOKEN_FILE = Path(".gist")
+REPOS_DIR = Path("repos")
+INDEX_FILE = Path("index.json")
+# Branch imported from every gist repository.
+GIST_BRANCH = "master"
+
+
+def run_git(*args):
+    """Run ``git`` with *args* and return its exit status.
+
+    The command is passed as an argument list, never through a shell, so
+    quotes or metacharacters in gist-provided values cannot alter it.
+    """
+    cmd = ["git", *args]
+    printable = " ".join(shlex.quote(arg) for arg in cmd)
+    print(f"Running {printable}")
+    status = subprocess.run(cmd, check=False).returncode
+    if status != 0:
+        # Best effort: report the failure and let the caller carry on.
+        print(f"Command failed with exit status {status}")
+    return status
+
+
+def gist_record(gist):
+    """Return the JSON-serialisable index entry for *gist*."""
+    return {
+        "id": gist.id,
+        "description": gist.description,
+        "public": gist.public,
+        "clone": gist.git_pull_url,
+        "updated": gist.updated_at.isoformat(),
+        "url": gist.url,
+    }
+
+
+def main():
+    """Import every missing public gist and write the index file."""
+    # read_text() opens and closes the token file in one step.
+    client = Github(TOKEN_FILE.read_text().strip())
+    index = []
+
+    for gist in client.get_user().get_gists():
+        if not gist.public:
+            print(f"Not cloning secret gist: {gist.id} - {gist.description}")
+            continue
+
+        path = REPOS_DIR / gist.id
+        if path.exists():
+            print(f"Repository exists already: {path}")
+            continue
+
+        # NOTE(review): gists skipped above never reach the index, so a
+        # re-run rewrites index.json with only the newly imported gists.
+        # Confirm whether that is intended.
+        index.append(gist_record(gist))
+
+        run_git("remote", "add", gist.id, gist.git_pull_url)
+        run_git("subtree", "add", "--prefix", str(path), gist.id, GIST_BRANCH)
+
+    with open(INDEX_FILE, "w") as f:
+        f.write(json.dumps(index, indent=4) + "\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/repos/d08fb2dd1b01cb8b46ac/genera.py b/repos/'generation of PC members for ESA website'/genera.py
similarity index 100%
rename from repos/d08fb2dd1b01cb8b46ac/genera.py
rename to repos/'generation of PC members for ESA website'/genera.py
diff --git a/repos/da5d5b45c894159b2f39/instrucciones.md b/repos/'selenium automatic doodle submission'/instrucciones.md
similarity index 100%
rename from repos/da5d5b45c894159b2f39/instrucciones.md
rename to repos/'selenium automatic doodle submission'/instrucciones.md
diff --git a/repos/da5d5b45c894159b2f39/troll_doodle.py b/repos/'selenium automatic doodle submission'/troll_mapal.py
similarity index 100%
rename from repos/da5d5b45c894159b2f39/troll_doodle.py
rename to repos/'selenium automatic doodle submission'/troll_mapal.py
diff --git 
a/repos/196bf8a0547b679c6ee23532bd94dabb/glob.py b/repos/'semeval process files'/glob.py similarity index 100% rename from repos/196bf8a0547b679c6ee23532bd94dabb/glob.py rename to repos/'semeval process files'/glob.py diff --git a/repos/196bf8a0547b679c6ee23532bd94dabb/oswalk.py b/repos/'semeval process files'/oswalk.py similarity index 100% rename from repos/196bf8a0547b679c6ee23532bd94dabb/oswalk.py rename to repos/'semeval process files'/oswalk.py diff --git a/repos/0aa0322c7ca044d099aa/*scratch*.el b/repos/0aa0322c7ca044d099aa/*scratch*.el deleted file mode 100644 index 16ce3f1..0000000 --- a/repos/0aa0322c7ca044d099aa/*scratch*.el +++ /dev/null @@ -1,2 +0,0 @@ -SCL - D1 -SDA - D2 \ No newline at end of file diff --git a/repos/0cc414e26db975fe02edfe99d1768d47/config_300.json b/repos/0cc414e26db975fe02edfe99d1768d47/config_300.json deleted file mode 100644 index cec97fb..0000000 --- a/repos/0cc414e26db975fe02edfe99d1768d47/config_300.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "embeddings": [ - { - "tensorName": "300 seeds", - "tensorShape": [ - 1975, - 100 - ], - "tensorPath": "https://lab.gsi.upm.es/oaraque/incel-embeddings/raw/master/300_seeds/neologisms_embeddings_2019-06-07_16-29.tsv", - "metadataPath": "https://lab.gsi.upm.es/oaraque/incel-embeddings/raw/master/300_seeds/neologisms_embeddings_words_2019-06-07_16-29.txt" - } - ] -} diff --git a/repos/4371655/mozpluggerrc b/repos/4371655/mozpluggerrc deleted file mode 100644 index da631b5..0000000 --- a/repos/4371655/mozpluggerrc +++ /dev/null @@ -1,7 +0,0 @@ -# Replace define(ACROREAD, [repeat swallow(acroread) fill : -# acroread -openInNewWindow /a "$fragment" "$file"]) -# -# With the following in /etc/mozpluggerrc to have evince as -# an embedded pdf viewer in chromium: - -define(ACROREAD, [repeat swallow(evince) fill needs_xembed : evince -f "$file"]) \ No newline at end of file diff --git a/repos/95fe96a85478ec7afbed/id_rsa.pub b/repos/95fe96a85478ec7afbed/id_rsa.pub deleted file mode 100644 index 
082563d..0000000 --- a/repos/95fe96a85478ec7afbed/id_rsa.pub +++ /dev/null @@ -1 +0,0 @@ -ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDZM2cshLida17Ay7EZW6jB9x3jkczT3QF7XJThPmruhv3E3V50mcT6oXf7eunks1XOJtBBJfxYnB93aGt/FJ/f8n44PF6uL+cuj8L9G1tbcIJXUY4HXDN/ewLN/yJqjqax9Sf/d/RV6i3AC+lD8neUsEk7xTzDNaygv+nz40vZ52aHda5AS1hh7XsVQ8SG/2tLvOGSzLT5bhOxqEBVt1CdaBM9dPZhNG8QnMUgEajywFP8OY13q6+boPGpHe2qBKAmIvN6J2uLhcQ6tIt6ODRGyklNY6B8Cagmf0/tCNg/RfzMAWMLFl1NBn3YCszDj/BR32Nw2pSaBzKdAsIyvmfd jfernando@dit.upm.es diff --git a/repos/92d6b8e9f88d37f57c58bdc683b2636d/Accessing Sentiment Analysis services.ipynb b/repos/Accessing Sentiment Analysis services/Accessing Sentiment Analysis services.ipynb similarity index 100% rename from repos/92d6b8e9f88d37f57c58bdc683b2636d/Accessing Sentiment Analysis services.ipynb rename to repos/Accessing Sentiment Analysis services/Accessing Sentiment Analysis services.ipynb diff --git a/repos/ddbd2be15e20eb81de1a/serializable.py b/repos/Add properties to serializable/serializable.py similarity index 100% rename from repos/ddbd2be15e20eb81de1a/serializable.py rename to repos/Add properties to serializable/serializable.py diff --git a/repos/0c268a23410464a26873/adduser.sh b/repos/Add user to Dokku/adduser.sh similarity index 100% rename from repos/0c268a23410464a26873/adduser.sh rename to repos/Add user to Dokku/adduser.sh diff --git a/repos/a6e97dcbb3b4bc295b64bc2d6325de94/jorge.py b/repos/Analyze tweets with the senpy client/jorge.py similarity index 100% rename from repos/a6e97dcbb3b4bc295b64bc2d6325de94/jorge.py rename to repos/Analyze tweets with the senpy client/jorge.py diff --git a/repos/25ed1a7b291a47399ae5/htaccess b/repos/Apache tricks/htaccess similarity index 100% rename from repos/25ed1a7b291a47399ae5/htaccess rename to repos/Apache tricks/htaccess diff --git a/repos/f011e9801c465378874c/guake.lua b/repos/Awesome/guake.lua similarity index 100% rename from repos/f011e9801c465378874c/guake.lua rename to repos/Awesome/guake.lua diff --git 
a/repos/f011e9801c465378874c/rc.lua b/repos/Awesome/rc.lua similarity index 100% rename from repos/f011e9801c465378874c/rc.lua rename to repos/Awesome/rc.lua diff --git a/repos/6263617/gistfile1.html b/repos/Badge Mr. X/gistfile1.html similarity index 100% rename from repos/6263617/gistfile1.html rename to repos/Badge Mr. X/gistfile1.html diff --git a/repos/f84238719ac6111d7a1a/checkdic.py b/repos/Check if key and value from template are present in indic/checkdic.py similarity index 100% rename from repos/f84238719ac6111d7a1a/checkdic.py rename to repos/Check if key and value from template are present in indic/checkdic.py diff --git a/repos/3cfb60263b6841cfbeaa/Check with rdflib.sh b/repos/Check with rdflib/Check with rdflib.sh similarity index 100% rename from repos/3cfb60263b6841cfbeaa/Check with rdflib.sh rename to repos/Check with rdflib/Check with rdflib.sh diff --git a/repos/Conectar Deckard/gistfile1 b/repos/Conectar Deckard/gistfile1 new file mode 100644 index 0000000..28f4d7a --- /dev/null +++ b/repos/Conectar Deckard/gistfile1 @@ -0,0 +1 @@ +ssh -p 50022 root@deckard.eestec.net \ No newline at end of file diff --git a/repos/f84c1f107f95103292ddec72c18f65f3/amazon_yeelight.py b/repos/Connect dash button to yeelight/amazon_yeelight.py similarity index 100% rename from repos/f84c1f107f95103292ddec72c18f65f3/amazon_yeelight.py rename to repos/Connect dash button to yeelight/amazon_yeelight.py diff --git a/repos/25131ca85f4f86c8d57c/copyMongo.js b/repos/Copy one collection from a database to another in the same MongoDB/copyMongo.js similarity index 100% rename from repos/25131ca85f4f86c8d57c/copyMongo.js rename to repos/Copy one collection from a database to another in the same MongoDB/copyMongo.js diff --git a/repos/3933798/gistfile1 b/repos/Create an ogv bumper for a video/gistfile1 similarity index 100% rename from repos/3933798/gistfile1 rename to repos/Create an ogv bumper for a video/gistfile1 diff --git a/repos/d63a79aaca0b3a8cddea/dns.py b/repos/DNS 
server that adds a record for every running docker container/dns.py similarity index 100% rename from repos/d63a79aaca0b3a8cddea/dns.py rename to repos/DNS server that adds a record for every running docker container/dns.py diff --git a/repos/004008aea84ab19b153b4cecd40e1461/Demo.ipynb b/repos/Demo gsitk/Demo.ipynb similarity index 100% rename from repos/004008aea84ab19b153b4cecd40e1461/Demo.ipynb rename to repos/Demo gsitk/Demo.ipynb diff --git a/repos/c231172db37d5393f722/Dockerfile b/repos/Deploying a static website in heroku without extra plugins/Dockerfile similarity index 100% rename from repos/c231172db37d5393f722/Dockerfile rename to repos/Deploying a static website in heroku without extra plugins/Dockerfile diff --git a/repos/c231172db37d5393f722/README.md b/repos/Deploying a static website in heroku without extra plugins/README.md similarity index 100% rename from repos/c231172db37d5393f722/README.md rename to repos/Deploying a static website in heroku without extra plugins/README.md diff --git a/repos/fe50181db5cd40f13ef70fbf33c7ddb5/descarga.ipynb b/repos/Descarga con bitter/descarga.ipynb similarity index 100% rename from repos/fe50181db5cd40f13ef70fbf33c7ddb5/descarga.ipynb rename to repos/Descarga con bitter/descarga.ipynb diff --git a/repos/00908058842bdbc0ea2c/DevDockerfile b/repos/DevDockerfile/DevDockerfile similarity index 100% rename from repos/00908058842bdbc0ea2c/DevDockerfile rename to repos/DevDockerfile/DevDockerfile diff --git a/repos/0d56ab9b79bfa7f2ab06/Emotion Analysis with DepecheMood-Copy1.ipynb b/repos/Emotion Analysis with DepecheMood-Copy1/Emotion Analysis with DepecheMood-Copy1.ipynb similarity index 100% rename from repos/0d56ab9b79bfa7f2ab06/Emotion Analysis with DepecheMood-Copy1.ipynb rename to repos/Emotion Analysis with DepecheMood-Copy1/Emotion Analysis with DepecheMood-Copy1.ipynb diff --git a/repos/09f0d8b86a6877f58b8e/rpi.sh b/repos/Emulate RPI QEmu/rpi.sh similarity index 100% rename from 
repos/09f0d8b86a6877f58b8e/rpi.sh rename to repos/Emulate RPI QEmu/rpi.sh diff --git a/repos/f17db63300ee88471ed9/Error SQL b/repos/Error SQL/Error SQL similarity index 100% rename from repos/f17db63300ee88471ed9/Error SQL rename to repos/Error SQL/Error SQL diff --git a/repos/6b6906b97267a0c5ac52/coding.py b/repos/Flask server that returns GET parameters and values /coding.py similarity index 100% rename from repos/6b6906b97267a0c5ac52/coding.py rename to repos/Flask server that returns GET parameters and values /coding.py diff --git a/repos/6b6906b97267a0c5ac52/server.py b/repos/Flask server that returns GET parameters and values /server.py similarity index 100% rename from repos/6b6906b97267a0c5ac52/server.py rename to repos/Flask server that returns GET parameters and values /server.py diff --git a/repos/96be6f4e94ae8a7645bc/upload.sh b/repos/Fuseki commands/upload.sh similarity index 100% rename from repos/96be6f4e94ae8a7645bc/upload.sh rename to repos/Fuseki commands/upload.sh diff --git a/repos/1462976b537a1d661e1d/kemsirve.py b/repos/Get a notification when a kemsirve kimsufi server is available/kemsirve.py similarity index 100% rename from repos/1462976b537a1d661e1d/kemsirve.py rename to repos/Get a notification when a kemsirve kimsufi server is available/kemsirve.py diff --git a/repos/1462976b537a1d661e1d/watch.sh b/repos/Get a notification when a kemsirve kimsufi server is available/watch.sh similarity index 100% rename from repos/1462976b537a1d661e1d/watch.sh rename to repos/Get a notification when a kemsirve kimsufi server is available/watch.sh diff --git a/repos/59fd9019f415fe8dd8f802c456be2b13/pybossa.py b/repos/Get the number of answers by pybossa users/pybossa.py similarity index 100% rename from repos/59fd9019f415fe8dd8f802c456be2b13/pybossa.py rename to repos/Get the number of answers by pybossa users/pybossa.py diff --git a/repos/c213d8dc550c7d5b25d6/example.py b/repos/Get twitter credentials with Flask/example.py similarity index 100% rename 
from repos/c213d8dc550c7d5b25d6/example.py rename to repos/Get twitter credentials with Flask/example.py diff --git a/repos/b8154c52bde270d34718/setup.py b/repos/Load the requirements from requirements.txt into your setup.py/setup.py similarity index 100% rename from repos/b8154c52bde270d34718/setup.py rename to repos/Load the requirements from requirements.txt into your setup.py/setup.py diff --git a/repos/dfd1fbc0c7e65e5e7634/gistfile1.sparql b/repos/Look for demonyms in dbpedia/gistfile1.sparql similarity index 100% rename from repos/dfd1fbc0c7e65e5e7634/gistfile1.sparql rename to repos/Look for demonyms in dbpedia/gistfile1.sparql diff --git a/repos/dfd1fbc0c7e65e5e7634/gistfile2.sparql b/repos/Look for demonyms in dbpedia/gistfile2.sparql similarity index 100% rename from repos/dfd1fbc0c7e65e5e7634/gistfile2.sparql rename to repos/Look for demonyms in dbpedia/gistfile2.sparql diff --git a/repos/e654835d4263378a3ac8/custom_palette.py b/repos/Matplotlib tricks/custom_palette.py similarity index 100% rename from repos/e654835d4263378a3ac8/custom_palette.py rename to repos/Matplotlib tricks/custom_palette.py diff --git a/repos/e654835d4263378a3ac8/customlabels.py b/repos/Matplotlib tricks/customlabels.py similarity index 100% rename from repos/e654835d4263378a3ac8/customlabels.py rename to repos/Matplotlib tricks/customlabels.py diff --git a/repos/b9c4f5cec28ba28af9e186194a63c0b4/coalition.py b/repos/Modelos Soil de prueba/coalition.py similarity index 100% rename from repos/b9c4f5cec28ba28af9e186194a63c0b4/coalition.py rename to repos/Modelos Soil de prueba/coalition.py diff --git a/repos/b9c4f5cec28ba28af9e186194a63c0b4/coalition.yml b/repos/Modelos Soil de prueba/coalition.yml similarity index 100% rename from repos/b9c4f5cec28ba28af9e186194a63c0b4/coalition.yml rename to repos/Modelos Soil de prueba/coalition.yml diff --git a/repos/887c5b163f5359a6a13a7858e7f484d4/prueba.py b/repos/Modelos Soil de prueba/prueba.py similarity index 100% rename from 
repos/887c5b163f5359a6a13a7858e7f484d4/prueba.py rename to repos/Modelos Soil de prueba/prueba.py diff --git a/repos/33835d7be3075c023aad/modules b/repos/Modules for TFT 3.2 Raspberry Pi/modules similarity index 100% rename from repos/33835d7be3075c023aad/modules rename to repos/Modules for TFT 3.2 Raspberry Pi/modules diff --git a/repos/9941322/migueljsonld.ipynb b/repos/Notebook con framing de jsonld/migueljsonld.ipynb similarity index 100% rename from repos/9941322/migueljsonld.ipynb rename to repos/Notebook con framing de jsonld/migueljsonld.ipynb diff --git a/repos/ff8baa5fea0b20372453/progres.py b/repos/Print progress in python/progres.py similarity index 100% rename from repos/ff8baa5fea0b20372453/progres.py rename to repos/Print progress in python/progres.py diff --git a/repos/1e1452d5ae6d0268c651/MarksMan b/repos/Prueba de bot para IronHack/MarksMan similarity index 100% rename from repos/1e1452d5ae6d0268c651/MarksMan rename to repos/Prueba de bot para IronHack/MarksMan diff --git a/repos/1b3cb1d48c75d2172d88/tweeply.py b/repos/Python property that mirrors the content of a dictionary. To convenienty add to a subclass of dict and then do: my_object.my_property/tweeply.py similarity index 100% rename from repos/1b3cb1d48c75d2172d88/tweeply.py rename to repos/Python property that mirrors the content of a dictionary. 
To convenienty add to a subclass of dict and then do: my_object.my_property/tweeply.py diff --git a/repos/bb677c23be8fced89ea14a1f098bdb10/example.tex b/repos/Rotated headers in latex tables/example.tex similarity index 100% rename from repos/bb677c23be8fced89ea14a1f098bdb10/example.tex rename to repos/Rotated headers in latex tables/example.tex diff --git a/repos/cdb3356ed7413637a3334dd0b27da13b/4_4.ipynb b/repos/SITC Classification/4_4.ipynb similarity index 100% rename from repos/cdb3356ed7413637a3334dd0b27da13b/4_4.ipynb rename to repos/SITC Classification/4_4.ipynb diff --git a/repos/cdb3356ed7413637a3334dd0b27da13b/README.md b/repos/SITC Classification/README.md similarity index 100% rename from repos/cdb3356ed7413637a3334dd0b27da13b/README.md rename to repos/SITC Classification/README.md diff --git a/repos/4322258/tar-and-scp.sh b/repos/Script to copy all the logs with a certain filename (YYY-MM-DD_.log) to a folder for the day, tar.gz the folder and send it to a remote host./tar-and-scp.sh similarity index 100% rename from repos/4322258/tar-and-scp.sh rename to repos/Script to copy all the logs with a certain filename (YYY-MM-DD_.log) to a folder for the day, tar.gz the folder and send it to a remote host./tar-and-scp.sh diff --git a/repos/db891b8203df37e5e9e22245820d29a6/set-default-browser.sh b/repos/Set default browser/set-default-browser.sh similarity index 100% rename from repos/db891b8203df37e5e9e22245820d29a6/set-default-browser.sh rename to repos/Set default browser/set-default-browser.sh diff --git a/repos/f375e47338f554523e6380954cba0d87/gistfile1.txt b/repos/Simple script to control octoprint/gistfile1.txt similarity index 100% rename from repos/f375e47338f554523e6380954cba0d87/gistfile1.txt rename to repos/Simple script to control octoprint/gistfile1.txt diff --git a/repos/f375e47338f554523e6380954cba0d87/octo.sh b/repos/Simple script to control octoprint/octo.sh similarity index 100% rename from repos/f375e47338f554523e6380954cba0d87/octo.sh 
rename to repos/Simple script to control octoprint/octo.sh diff --git a/repos/a5894ae26437006b8efb/Makefile b/repos/Some automation to submit to a journal (e.g. Elsevier). It detects the main latex file and eps files used. /Makefile similarity index 100% rename from repos/a5894ae26437006b8efb/Makefile rename to repos/Some automation to submit to a journal (e.g. Elsevier). It detects the main latex file and eps files used. /Makefile diff --git a/repos/a8115622662239a267893743f10e3918/README.md b/repos/Test nslookup different k8s namespaces/README.md similarity index 100% rename from repos/a8115622662239a267893743f10e3918/README.md rename to repos/Test nslookup different k8s namespaces/README.md diff --git a/repos/a8115622662239a267893743f10e3918/gistfile1.txt b/repos/Test nslookup different k8s namespaces/gistfile1.txt similarity index 100% rename from repos/a8115622662239a267893743f10e3918/gistfile1.txt rename to repos/Test nslookup different k8s namespaces/gistfile1.txt diff --git a/repos/a8115622662239a267893743f10e3918/pruebans1.yml b/repos/Test nslookup different k8s namespaces/pruebans1.yml similarity index 100% rename from repos/a8115622662239a267893743f10e3918/pruebans1.yml rename to repos/Test nslookup different k8s namespaces/pruebans1.yml diff --git a/repos/cbe892fe3322a382765a/ontology.sh b/repos/Upload ontologies to GSI server/ontology.sh similarity index 100% rename from repos/cbe892fe3322a382765a/ontology.sh rename to repos/Upload ontologies to GSI server/ontology.sh diff --git a/repos/5678622/OpenCalais.sh b/repos/Use OpenCalais API/OpenCalais.sh similarity index 100% rename from repos/5678622/OpenCalais.sh rename to repos/Use OpenCalais API/OpenCalais.sh diff --git a/repos/fb9e68c7ec482d136c77/Git magic.md b/repos/Useful commands for git/Git magic.md similarity index 100% rename from repos/fb9e68c7ec482d136c77/Git magic.md rename to repos/Useful commands for git/Git magic.md diff --git a/repos/6037297/ValidateOWL.sh 
b/repos/ValidateOWL/ValidateOWL.sh similarity index 100% rename from repos/6037297/ValidateOWL.sh rename to repos/ValidateOWL/ValidateOWL.sh diff --git a/repos/015554b095b5525e26489c4db33e2270/letsencrypt.yml b/repos/ansible letsencrypt/letsencrypt.yml similarity index 100% rename from repos/015554b095b5525e26489c4db33e2270/letsencrypt.yml rename to repos/ansible letsencrypt/letsencrypt.yml diff --git a/repos/e4b375171b20aea1fdc4/api_swagger.yaml b/repos/api_swagger/api_swagger.yaml similarity index 100% rename from repos/e4b375171b20aea1fdc4/api_swagger.yaml rename to repos/api_swagger/api_swagger.yaml diff --git a/repos/comparison/comparison.md b/repos/comparison/comparison.md new file mode 100644 index 0000000..b0964b6 --- /dev/null +++ b/repos/comparison/comparison.md @@ -0,0 +1,170 @@ +This is a quick comparison of three different approaches to load a big CSV/TSV file into a sqlite database. + +TL;DR sqlite has an `.import` function that works wonders. + +# Results +## Python + +``` + ❯ time python2 convert.py sample-twitter_rv.net +Done: 9999999 lines. 131128444 / 131128444 bytes (100.0 %) +python2 convert.py sample-twitter_rv.net 63.96s user 27.51s system 63% cpu 2:23.53 total +``` + +This is the output of cProfile: +``` +python2 -m cProfile convert.py sample-twitter_rv.net +Done: 9999999 lines. 
131128444 / 131128444 bytes (100.0 %) + 50006080 function calls (50006073 primitive calls) in 241.581 seconds + + Ordered by: cumulative time + + ncalls tottime percall cumtime percall filename:lineno(function) + 1 0.000 0.000 241.581 241.581 convert.py:1() + 1 12.686 12.686 241.576 241.576 convert.py:18(main) + 10000000 5.114 0.000 157.256 0.000 convert.py:7(addusers) + 10000004 152.164 0.000 152.164 0.000 {method 'execute' of 'sqlite3.Connection' objects} + 1001 66.972 0.067 66.972 0.067 {method 'commit' of 'sqlite3.Connection' objects} + 10000002 2.471 0.000 2.471 0.000 {method 'split' of 'str' objects} + 10000000 1.103 0.000 1.103 0.000 {method 'strip' of 'str' objects} + 10000000 1.015 0.000 1.015 0.000 {len} + 1001 0.032 0.000 0.046 0.000 convert.py:12(update_progress) + 1001 0.011 0.000 0.011 0.000 {method 'format' of 'str' objects} + 1000 0.005 0.000 0.005 0.000 {method 'tell' of 'file' objects} + 1 0.000 0.000 0.005 0.005 __init__.py:24() + 1 0.002 0.002 0.004 0.004 dbapi2.py:24() + 2002 0.003 0.000 0.003 0.000 {method 'write' of 'file' objects} + 1 0.001 0.001 0.002 0.002 collections.py:11() +``` + +## Golang + + +### With Transactions + +``` + ❯ time ./twitter-sqlite sample-twitter_rv.net + 131128444/131128444 Bytes (100.000%) 10000000 lines - 0.00 Bps (avg. 
1656955.51 Bps) +./sqlite-twitter sample-twitter_rv.net 71.16s user 24.15s system 120% cpu 1:19.15 total +``` + +Using the pprof module, I could also extract some profiling information: + +``` + ❯ go tool pprof sqlite-twitter tx +File: sqlite-twitter +Build ID: 708e90eba7948cb0851dfbf3bb6170ccaa418eff +Type: cpu +Time: Aug 24, 2018 at 4:26pm (CEST) +Duration: 1.26mins, Total samples = 1.47mins (117.03%) +Entering interactive mode (type "help" for commands, "o" for options) +(pprof) cum +(pprof) top20 +Showing nodes accounting for 65.26s, 73.82% of 88.40s total +Dropped 222 nodes (cum <= 0.44s) +Showing top 20 nodes out of 70 + flat flat% sum% cum cum% + 0.09s 0.1% 0.1% 58.96s 66.70% main.main + 0.44s 0.5% 0.6% 53.39s 60.40% database/sql.(*Tx).StmtContext + 0.11s 0.12% 0.72% 48.59s 54.97% database/sql.asString + 0.06s 0.068% 0.79% 47.55s 53.79% github.com/mattn/go-sqlite3.(*SQLiteConn).Prepare + 43.32s 49.00% 49.80% 44.27s 50.08% runtime.c128hash + 0.02s 0.023% 49.82% 40.16s 45.43% net/url.Values.Encode + 0.02s 0.023% 49.84% 39.79s 45.01% github.com/mattn/go-sqlite3.(*SQLiteConn).lastError.func3 + 0.24s 0.27% 50.11% 23.36s 26.43% runtime.findrunnable + 0.03s 0.034% 50.15% 23.24s 26.29% runtime.casgstatus.func3 + 0.44s 0.5% 50.64% 16.84s 19.05% runtime.forEachP + 0.17s 0.19% 50.84% 15.17s 17.16% internal/poll.runtime_pollSetDeadline + 14.98s 16.95% 67.78% 14.98s 16.95% runtime.duffcopy + 0.41s 0.46% 68.25% 10.67s 12.07% runtime.startm + 0.17s 0.19% 68.44% 8.62s 9.75% runtime.panicdottypeI + 0.11s 0.12% 68.56% 7.25s 8.20% runtime.needm + 0.31s 0.35% 68.91% 6.97s 7.88% runtime.panicdottypeE + 0.33s 0.37% 69.29% 6.90s 7.81% github.com/mattn/go-sqlite3.(*SQLiteDriver).Open + 1.81s 2.05% 71.33% 6.38s 7.22% runtime.newm1 + 0.09s 0.1% 71.44% 4.75s 5.37% runtime.startlockedm + 2.11s 2.39% 73.82% 4.66s 5.27% runtime.schedtrace + +``` + + +### Raw statements and fmt.Sprintf + +Plain awful + +### Just one transaction + +``` + ❯ time ./twitter-sqlite sample-twitter_rv.net 
+./sqlite-twitter sample-twitter_rv.net 67.94s user 20.34s system 129% cpu 1:08.10 total +``` + +``` + ❯ go tool pprof sqlite-twitter tx +File: sqlite-twitter +Build ID: 7ec752e835de12b94418fffb45515e1b0f89e89f +Type: cpu +Time: Aug 24, 2018 at 4:57pm (CEST) +Duration: 1.25mins, Total samples = 1.46mins (117.52%) +Entering interactive mode (type "help" for commands, "o" for options) +(pprof) cum +(pprof) top20 +Showing nodes accounting for 64.61s, 73.59% of 87.80s total +Dropped 207 nodes (cum <= 0.44s) +Showing top 20 nodes out of 70 + flat flat% sum% cum cum% + 0.08s 0.091% 0.091% 55.82s 63.58% main.updateStatus + 0.49s 0.56% 0.65% 51.68s 58.86% database/sql.(*Tx).StmtContext + 0.12s 0.14% 0.79% 46.56s 53.03% database/sql.asString + 0.11s 0.13% 0.91% 45.52s 51.85% github.com/mattn/go-sqlite3.(*SQLiteConn).Prepare + 41.12s 46.83% 47.74% 41.97s 47.80% runtime.c128hash + 0.01s 0.011% 47.76% 38.64s 44.01% github.com/mattn/go-sqlite3.(*SQLiteConn).lastError.func3 + 0.05s 0.057% 47.81% 38.64s 44.01% net/url.Values.Encode + 0.38s 0.43% 48.25% 25.40s 28.93% runtime.findrunnable + 0.04s 0.046% 48.29% 25.25s 28.76% runtime.casgstatus.func3 + 0.51s 0.58% 48.87% 17.72s 20.18% runtime.forEachP + 0.30s 0.34% 49.21% 15.93s 18.14% internal/poll.runtime_pollSetDeadline + 15.61s 17.78% 66.99% 15.61s 17.78% runtime.duffcopy + 0.46s 0.52% 67.52% 11.50s 13.10% runtime.startm + 0.15s 0.17% 67.69% 9.12s 10.39% runtime.panicdottypeI + 1.71s 1.95% 69.64% 7.39s 8.42% runtime.newm1 + 0.06s 0.068% 69.70% 7.34s 8.36% runtime.needm + 0.36s 0.41% 70.11% 7.28s 8.29% runtime.panicdottypeE + 0.45s 0.51% 70.63% 5.93s 6.75% github.com/mattn/go-sqlite3.(*SQLiteDriver).Open + 2.48s 2.82% 73.45% 5.72s 6.51% runtime.schedtrace + 0.12s 0.14% 73.59% 4.77s 5.43% runtime.startlockedm +``` +## CLI + +### Indexing first + +``` +❯ time sh sqlite.sh sample-twitter_rv.net +sh sqlite.sh sample-twitter_rv.net 25.18s user 6.67s system 91% cpu 34.900 total +``` + +### Indexing afterwards + +``` +❯ time sh sqlite.sh 
sample-twitter_rv.net +sh sqlite.sh sample-twitter_rv.net 14.91s user 1.30s system 84% cpu 19.279 total +``` + +# Comments + +There are way too many knobs to fiddle with, and I know very little about sqlite or SQL in general. +This is a very specific use-case, and I've tried to tune the settings accordingly. + +Python was the easiest one to try. +It is the language I'm most familiar with, and sqlite3 is included in the standard library, so only this file is needed. + +In Go, I tried compiling with `go build` on my machine and copying the binary to a remote host. +I couldn't run it, apparently due to a mismatched glibc version or library path (`LD_LIBRARY_PATH`). +Instead, I had to use: `go build -ldflags "-linkmode external -extldflags -static" .`. +It raises a warning, but I had no issue in my tests. + +In the end, the sqlite command line was the fastest of the three, and very easy to set up. + +If the file you are working with is sorted and without duplicates, the best option is to create the indexes after all the data has been loaded. +You will also have to start over if the import fails or is interrupted. +It is harder to remove duplicates afterwards in such a big dataset, and the cleanest solution is to simply copy all the unique entries to a new table and delete the old one. 
\ No newline at end of file diff --git a/repos/9207706/context.jsonld b/repos/context/context.jsonld similarity index 100% rename from repos/9207706/context.jsonld rename to repos/context/context.jsonld diff --git a/repos/05a0a14deacbf96258ac/convert b/repos/convert json to jsonb in postgres/convert similarity index 100% rename from repos/05a0a14deacbf96258ac/convert rename to repos/convert json to jsonb in postgres/convert diff --git a/repos/7319938/djangosubclass.py b/repos/djangosubclass/djangosubclass.py similarity index 100% rename from repos/7319938/djangosubclass.py rename to repos/djangosubclass/djangosubclass.py diff --git a/repos/fb995f5ddf394bea714e/emote.php b/repos/emote/emote.php similarity index 100% rename from repos/fb995f5ddf394bea714e/emote.php rename to repos/emote/emote.php diff --git a/repos/b21443fe1d57fdfe0fe2f79827be3551/hostname.py b/repos/get hostname from ifconfig.co/hostname.py similarity index 100% rename from repos/b21443fe1d57fdfe0fe2f79827be3551/hostname.py rename to repos/get hostname from ifconfig.co/hostname.py diff --git a/repos/6113845/group_pictures_UPM.py b/repos/group_pictures_UPM/group_pictures_UPM.py similarity index 100% rename from repos/6113845/group_pictures_UPM.py rename to repos/group_pictures_UPM/group_pictures_UPM.py diff --git a/repos/e05d7157733983b56869/haproxy.cfg b/repos/haproxy/haproxy.cfg similarity index 100% rename from repos/e05d7157733983b56869/haproxy.cfg rename to repos/haproxy/haproxy.cfg diff --git a/repos/410cbae61e1acba67fc3/id_rsa.pub b/repos/id_rsa/id_rsa.pub similarity index 100% rename from repos/410cbae61e1acba67fc3/id_rsa.pub rename to repos/id_rsa/id_rsa.pub diff --git a/repos/f19f15be4e3e6a24bbbf/id_rsa.pub b/repos/id_rsa/id_rsa_balkian@sinpapel.es/id_rsa.pub similarity index 100% rename from repos/f19f15be4e3e6a24bbbf/id_rsa.pub rename to repos/id_rsa/id_rsa_balkian@sinpapel.es/id_rsa.pub diff --git a/repos/e0e184f00cd60d9e7b09/id_rsa.pub b/repos/id_rsa/id_rsa_xps/id_rsa.pub similarity 
index 100% rename from repos/e0e184f00cd60d9e7b09/id_rsa.pub rename to repos/id_rsa/id_rsa_xps/id_rsa.pub diff --git a/repos/2a64d7564414f1949ec6/iftt.py b/repos/iftt/iftt.py similarity index 100% rename from repos/2a64d7564414f1949ec6/iftt.py rename to repos/iftt/iftt.py diff --git a/repos/e0f5e471500f9fc4b890/migrate.py b/repos/migrate mongodb twitter/migrate.py similarity index 100% rename from repos/e0f5e471500f9fc4b890/migrate.py rename to repos/migrate mongodb twitter/migrate.py diff --git a/repos/41c51d1a6f6d4277c6cee5a92b8ba6d1/pandalatex.py b/repos/pandalatex/pandalatex.py similarity index 100% rename from repos/41c51d1a6f6d4277c6cee5a92b8ba6d1/pandalatex.py rename to repos/pandalatex/pandalatex.py diff --git a/repos/9186599/Array combinations b/repos/python array and frozenset combinations/Array combinations similarity index 100% rename from repos/9186599/Array combinations rename to repos/python array and frozenset combinations/Array combinations diff --git a/repos/9186599/Array combinations as sets b/repos/python array and frozenset combinations/Array combinations as sets similarity index 100% rename from repos/9186599/Array combinations as sets rename to repos/python array and frozenset combinations/Array combinations as sets diff --git a/repos/9186870/array_combination.ipynb b/repos/python array and frozenset combinations/array_combination.ipynb similarity index 100% rename from repos/9186870/array_combination.ipynb rename to repos/python array and frozenset combinations/array_combination.ipynb diff --git a/repos/9186870/frozen_combination.py b/repos/python array and frozenset combinations/frozen_combination.py similarity index 100% rename from repos/9186870/frozen_combination.py rename to repos/python array and frozenset combinations/frozen_combination.py diff --git a/repos/2de5983237b66098945374eb79878a1d/recursivedict.py b/repos/recursivedict/recursivedict.py similarity index 100% rename from repos/2de5983237b66098945374eb79878a1d/recursivedict.py 
rename to repos/recursivedict/recursivedict.py diff --git a/repos/5048738/recursively_add.py b/repos/recursively_add/recursively_add.py similarity index 100% rename from repos/5048738/recursively_add.py rename to repos/recursively_add/recursively_add.py diff --git a/repos/61a2fae8d94ede667f4a/remove_output.py b/repos/remove jupyter output/remove_output.py similarity index 100% rename from repos/61a2fae8d94ede667f4a/remove_output.py rename to repos/remove jupyter output/remove_output.py diff --git a/repos/a383e5b6e980c002bae2/rpi.scad b/repos/rpi/rpi.scad similarity index 100% rename from repos/a383e5b6e980c002bae2/rpi.scad rename to repos/rpi/rpi.scad diff --git a/repos/5a26850fd18725fc17d698fd6d09f1ba/scrape_aliexpress.py b/repos/scrape_aliexpress/scrape_aliexpress.py similarity index 100% rename from repos/5a26850fd18725fc17d698fd6d09f1ba/scrape_aliexpress.py rename to repos/scrape_aliexpress/scrape_aliexpress.py diff --git a/repos/90c0497c0b207812cbbc/specgen.sh b/repos/specgen/specgen.sh similarity index 100% rename from repos/90c0497c0b207812cbbc/specgen.sh rename to repos/specgen/specgen.sh diff --git a/repos/90c0497c0b207812cbbc/upload.sh b/repos/specgen/upload.sh similarity index 100% rename from repos/90c0497c0b207812cbbc/upload.sh rename to repos/specgen/upload.sh diff --git a/repos/4ff59e1673a1a754e14f8fd619581cbf/submodules2subtrees.sh b/repos/submodules2subtrees/submodules2subtrees.sh similarity index 100% rename from repos/4ff59e1673a1a754e14f8fd619581cbf/submodules2subtrees.sh rename to repos/submodules2subtrees/submodules2subtrees.sh diff --git a/repos/9056b004cfa3edcb1ef3/ES_MA_NIF b/repos/templates marl generator/ES_MA_NIF similarity index 100% rename from repos/9056b004cfa3edcb1ef3/ES_MA_NIF rename to repos/templates marl generator/ES_MA_NIF diff --git a/repos/9056b004cfa3edcb1ef3/ES_MA_to_Marl b/repos/templates marl generator/ES_MA_to_Marl similarity index 100% rename from repos/9056b004cfa3edcb1ef3/ES_MA_to_Marl rename to repos/templates marl 
generator/ES_MA_to_Marl diff --git a/repos/9056b004cfa3edcb1ef3/PT_HA_NIF b/repos/templates marl generator/PT_HA_NIF similarity index 100% rename from repos/9056b004cfa3edcb1ef3/PT_HA_NIF rename to repos/templates marl generator/PT_HA_NIF diff --git a/repos/9056b004cfa3edcb1ef3/PT_HA_to_Marl b/repos/templates marl generator/PT_HA_to_Marl similarity index 100% rename from repos/9056b004cfa3edcb1ef3/PT_HA_to_Marl rename to repos/templates marl generator/PT_HA_to_Marl diff --git a/repos/9056b004cfa3edcb1ef3/PT_MA_to_Marl b/repos/templates marl generator/PT_MA_to_Marl similarity index 100% rename from repos/9056b004cfa3edcb1ef3/PT_MA_to_Marl rename to repos/templates marl generator/PT_MA_to_Marl diff --git a/repos/9056b004cfa3edcb1ef3/PT_SA_to_Marl b/repos/templates marl generator/PT_SA_to_Marl similarity index 100% rename from repos/9056b004cfa3edcb1ef3/PT_SA_to_Marl rename to repos/templates marl generator/PT_SA_to_Marl diff --git a/repos/9056b004cfa3edcb1ef3/TripAdvisor.json b/repos/templates marl generator/TripAdvisor.json similarity index 100% rename from repos/9056b004cfa3edcb1ef3/TripAdvisor.json rename to repos/templates marl generator/TripAdvisor.json diff --git a/repos/06c454a48156dccc6f55/template-ES_MA-json-ld b/repos/templates marl generator/template-ES_MA-json-ld similarity index 100% rename from repos/06c454a48156dccc6f55/template-ES_MA-json-ld rename to repos/templates marl generator/template-ES_MA-json-ld diff --git a/repos/06c454a48156dccc6f55/template-ES_MA-nt b/repos/templates marl generator/template-ES_MA-nt similarity index 100% rename from repos/06c454a48156dccc6f55/template-ES_MA-nt rename to repos/templates marl generator/template-ES_MA-nt diff --git a/repos/06c454a48156dccc6f55/template-PT_HA-json-ld b/repos/templates marl generator/template-PT_HA-json-ld similarity index 100% rename from repos/06c454a48156dccc6f55/template-PT_HA-json-ld rename to repos/templates marl generator/template-PT_HA-json-ld diff --git 
a/repos/06c454a48156dccc6f55/template-PT_HA-nt b/repos/templates marl generator/template-PT_HA-nt similarity index 100% rename from repos/06c454a48156dccc6f55/template-PT_HA-nt rename to repos/templates marl generator/template-PT_HA-nt diff --git a/repos/06c454a48156dccc6f55/template-PT_MA-json-ld b/repos/templates marl generator/template-PT_MA-json-ld similarity index 100% rename from repos/06c454a48156dccc6f55/template-PT_MA-json-ld rename to repos/templates marl generator/template-PT_MA-json-ld diff --git a/repos/06c454a48156dccc6f55/template-PT_MA-nt b/repos/templates marl generator/template-PT_MA-nt similarity index 100% rename from repos/06c454a48156dccc6f55/template-PT_MA-nt rename to repos/templates marl generator/template-PT_MA-nt diff --git a/repos/06c454a48156dccc6f55/template-PT_SA-json-ld b/repos/templates marl generator/template-PT_SA-json-ld similarity index 100% rename from repos/06c454a48156dccc6f55/template-PT_SA-json-ld rename to repos/templates marl generator/template-PT_SA-json-ld diff --git a/repos/06c454a48156dccc6f55/template-PT_SA-nt b/repos/templates marl generator/template-PT_SA-nt similarity index 100% rename from repos/06c454a48156dccc6f55/template-PT_SA-nt rename to repos/templates marl generator/template-PT_SA-nt diff --git a/repos/06c454a48156dccc6f55/template-TrendMiner Sentimerge-json-ld b/repos/templates marl generator/template-TrendMiner Sentimerge-json-ld similarity index 100% rename from repos/06c454a48156dccc6f55/template-TrendMiner Sentimerge-json-ld rename to repos/templates marl generator/template-TrendMiner Sentimerge-json-ld diff --git a/repos/06c454a48156dccc6f55/template-raw-json-ld b/repos/templates marl generator/template-raw-json-ld similarity index 100% rename from repos/06c454a48156dccc6f55/template-raw-json-ld rename to repos/templates marl generator/template-raw-json-ld diff --git a/repos/fc3ede1ad607e8785b80/unified_v2.template b/repos/templates marl generator/unified_v2.template similarity index 100% rename 
from repos/fc3ede1ad607e8785b80/unified_v2.template rename to repos/templates marl generator/unified_v2.template diff --git a/repos/f2336b4263cc7fa4db91/testdflib.py b/repos/testdflib/testdflib.py similarity index 100% rename from repos/f2336b4263cc7fa4db91/testdflib.py rename to repos/testdflib/testdflib.py diff --git a/repos/e6890355c392e5a99c3b65b1a4030817/tutorial.ipynb b/repos/tutorial/tutorial.ipynb similarity index 100% rename from repos/e6890355c392e5a99c3b65b1a4030817/tutorial.ipynb rename to repos/tutorial/tutorial.ipynb diff --git a/repos/9e13638fcc089ae6d90d/follow.list b/repos/twitter follow list for experiment/follow.list similarity index 100% rename from repos/9e13638fcc089ae6d90d/follow.list rename to repos/twitter follow list for experiment/follow.list