1
0
mirror of https://github.com/balkian/slides-intro-spark.git synced 2024-11-21 11:52:28 +00:00
slides-intro-spark/spark1.tex
2018-10-16 15:42:30 +02:00

1166 lines
28 KiB
TeX

\documentclass{beamer}
\usepackage[utf8]{inputenc}
\usepackage{media9}
\usepackage{lipsum}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{amssymb}
\usetheme{metropolis}
\usepackage{tcolorbox}
\usepackage{listings}
\usepackage{lstlinebgcolor}
\usepackage{MnSymbol}
\usepackage{wasysym}
\usepackage{animate}
\lstset{%
basicstyle=\ttfamily\large,
columns=fullflexible,
escapeinside={(*@}{@*)},
numbers=none,
breaklines=true,
numbersep=5pt, % how far the line-numbers are from the code
numberstyle=\tiny\color{gray}, % the style that is used for the line-numbers
numbersep=5pt, % how far the line-numbers are from the code
% postbreak={\hbox{\raisebox{0ex}[0ex][0ex]\color{red}{\hookrightarrow}\space}}
postbreak=\raisebox{0ex}[0ex][0ex]{\ensuremath{\rcurvearrowse\space}},
keywordstyle=\color{blue}, % keyword style
stringstyle=\color{red}, % string literal style
language=Scala,
% belowskip=0pt,
% aboveskip=0pt,
}
\definecolor{lightyellow}{RGB}{255,255,204}
\title{Spark}
\subtitle{Cluster computing}
\author{J. Fernando Sánchez, Joaquín Salvachúa, Gabriel Huecas }
\institute{Universidad Politécnica de Madrid}
\date{2016}
\newcommand{\btVFill}{\vskip0pt plus 1filll}
\begin{document}
\begin{frame}
\titlepage{}
\end{frame}
\begin{frame}[allowframebreaks]
\frametitle{Outline}
\tableofcontents
\end{frame}
\section{Background}
\begin{frame}
\frametitle{LISP and functional programming}
\begin{columns}
\column{0.5\textwidth}
\begin{itemize}
\item Higher level programming
\item Avoid side effects
\item Pattern matching
\end{itemize}
\column{0.5\textwidth}
\includegraphics[width=\textwidth]{images/lisplogo.png}
\end{columns}
\end{frame}
\begin{frame}
\frametitle{Scala}
\begin{columns}
\column{0.5\textwidth}
\includegraphics[width=\textwidth]{images/scalalogo.png}
\column{0.5\textwidth}
\begin{itemize}
\item A \textit{better} Java
\item Functional programming (optional)
\item Actors for (coarse) concurrency
\end{itemize}
\end{columns}
\end{frame}
\begin{frame}
\frametitle{Docker}
\begin{columns}
\column{0.5\textwidth}
\begin{itemize}
\item Easy and repeatable deployment
\item Lots of pre-built images @ hub.docker.com
\item Building block for other tools (swarm, compose, machine...)
\end{itemize}
\column{0.5\textwidth}
\includegraphics[width=\textwidth]{images/docker_logo.png}
\end{columns}
\end{frame}
{
\pagecolor{black}
\usebackgroundtemplate{\vbox to \paperheight{\vfil\hbox to \paperwidth{\hfil\includegraphics[height=\paperheight]{images/knowscala.jpg}\hfil}\vfil}}%
\begin{frame}[plain]
\end{frame}
}
{
\pagecolor{black}
\usebackgroundtemplate{\vbox to \paperheight{\vfil\hbox to \paperwidth{\hfil\includegraphics[height=\paperheight]{images/knowbigdata.jpg}\hfil}\vfil}}%
\begin{frame}[plain]
\end{frame}
}
% \begin{frame}[plain]
% \center
% \huge{Show me}
% \end{frame}
{
\pagecolor{white}
\usebackgroundtemplate{\vbox to \paperheight{\vfil\hbox to \paperwidth{\hfil\includegraphics[width=\paperwidth]{images/showme.png}\hfil}\vfil}}%
\begin{frame}[plain]
\end{frame}
}
\begin{frame}[fragile]
\frametitle{Word count in Wikipedia}
Problem: find the frequency each word is used in Wikipedia.
We have the text of all wikipedia in a text file\footnote{By happy coincidence, the first two lines are ``hi world'' and ``hi''}. It begins like this:
\begin{lstlisting}[backgroundcolor=\color{lightyellow},language={},postbreak={},
breakautoindent=false, breakindent=0pt, breaklines]
hi world
hi
Scala (SKAH-lah)[9] is a general-purpose programming language. Scala has full support for functional programming and a strong static type system. Designed to be concise,[10] many of Scala's design decisions were inspired by criticism of Java's shortcomings.[8]
\end{lstlisting}
\end{frame}
\begin{frame}[fragile]
\frametitle{Algorithm}
\begin{itemize}
\item Read every line
\item Chunk every line into words
\item Count every occurrence
\item For every word, sum its occurrences
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Possible results of every step}
\begin{lstlisting}[language={},numbers=none,linebackgroundcolor={
\btLstHL<1>{1}%
\btLstHL<2>{2}%
\btLstHL<3>{3,4}%
\btLstHL<4>{5}%
}]
(("hi world"), ("hi") ...)
List((hi, 1), (hi, 1), (world, 1) ...)
Map(hi ->(("hi", 1), (hi, 1)),
world -> ((world, 1)) ...)
Map(hi-> 2, world -> 1 ...)
\end{lstlisting}
\end{frame}
\begin{frame}[fragile]
\frametitle{Running the scala shell}
We will use docker.
\begin{lstlisting}[language=bash,numbers=none]
docker run -it -v $PWD:Wikipedia.txt:Wiki \
--rm williamyeh/scala
\end{lstlisting}
\end{frame}
\begin{frame}[fragile]
\frametitle{Scala code}
%[linebackgroundcolor={
% \btLstHL<1>{1} %
% \btLstHL<2>{3} %
% \btLstHL<3>{5} %
% \btLstHL<4>{7} %
% \btLstHL<5>{9} %
% }]
\begin{lstlisting}[language=Scala,linebackgroundcolor={
\btLstHL<1>{2}%
\btLstHL<2>{3}%
\btLstHL<3>{4}%
\btLstHL<4>{5}%
\btLstHL<5>{6,7}%
}]
import scala.io.Source
val wiki = Source.fromFile("Wiki").getLines
wiki.flatMap(line=> line.split(" "))
map(x=>(x, 1)).toList
groupBy(x => x._1)
map({case (k, v) =>
(k, v.foldLeft(0)((a, b) => a+b._2)))
\end{lstlisting}
\end{frame}
\begin{frame}
\center
{\huge Let's run it in the shell.}
\end{frame}
{
\pagecolor{black}
\usebackgroundtemplate{\vbox to \paperheight{\vfil\hbox to \paperwidth{\hfil\includegraphics[height=\paperheight]{images/justwaithere.jpg}\hfil}\vfil}}%
\begin{frame}[plain]
\end{frame}
}
\begin{frame}
\frametitle{Wikipedia is big}
\includegraphics[width=\textwidth]{images/outofmemory.jpg}
\end{frame}
\begin{frame}
\center
{\huge What happened?}
\end{frame}
\begin{frame}
\frametitle{Limited resources}
\begin{itemize}
\item CPU limits our speed
\begin{itemize}
\item Multi-cores help...
\item ...but real parallelism is \textbf{hard}
\end{itemize}
\item RAM limits how much data you can process at the same time
\begin{itemize}
\item What if you need more than 128GB?
\item You could use more than one computer...
\item ... but cluster computing is even harder than ``local'' parallelism
\end{itemize}
\item Functional programming helps a bit
\end{itemize}
\end{frame}
\begin{frame}[plain]
\center
\huge But... this was supposed to be fun, wasn't it?
\end{frame}
\section{Introduction to Spark}
\subsection{What is Spark?}
{
\pagecolor{white}
\usebackgroundtemplate{\vbox to \paperheight{\vfil\hbox to \paperwidth{\hfil\includegraphics[width=\paperwidth]{images/sparkweb.png}\hfil}\vfil}}%
\begin{frame}[plain]
\end{frame}
}
\begin{frame}
\frametitle{Quick definition}
{\center {\huge Apache Spark™ is a fast and general engine for large-scale data processing.\footnote{\url{http://spark.apache.org}}}}
On top of that:
\begin{itemize}
\item Open source (Top-level Apache project)
\item Plays well with other tools
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Architecture}
\includegraphics[width=\textwidth]{images/cluster-overview.png}
\end{frame}
\begin{frame}
\frametitle{Programs}
\center
\includegraphics[height=.8\textheight]{images/sparkapp.png}
\end{frame}
\begin{frame}
\frametitle{Ecosystem}
\includegraphics[width=\textwidth]{images/sparkecosystem.png}
\end{frame}
\subsection{vs MapReduce}
{
\pagecolor{white}
\usebackgroundtemplate{\vbox to \paperheight{\vfil\hbox to \paperwidth{\hfil\includegraphics[width=\paperwidth]{images/hadoop-spark.png}\hfil}\vfil}}%
\begin{frame}[plain]
% \center
% \begin{tikzpicture}[remember picture,overlay]
% \node[at=(current page.center)] {
% \includegraphics[height=\paperheight]{images/justwaithere.jpg}
% };
% \end{tikzpicture}
\end{frame}
}
\begin{frame}
\frametitle{Comparison to MapReduce}
\begin{itemize}
\item In-memory data
\begin{itemize}
\item Less i/o overhead
\item Faster operations
\item Caching
\end{itemize}
\item Better for recursive tasks (e.g. machine learning)
\item Some libraries are dropping MapReduce support
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Contributors to Spark/Hadoop 2014}
\includegraphics[width=\textwidth]{images/hadoopsparkcontribs.png}
\end{frame}
\begin{frame}
\frametitle{Project status}
\center
\includegraphics[width=\textwidth]{images/sparkgh.png}
\includegraphics[height=.35\textheight]{images/contributors.png}
\end{frame}
\subsection{Key concepts}
\begin{frame}
\frametitle{Overview}
% {\huge Two core concepts}
% \vfill
\begin{columns}[t]
\column{0.5\textwidth}
Data (RDDs/Datasets)
\begin{itemize}
\item RDD: Resilient Distributed Dataset
\item Collections of objects spread across a cluster, stored in RAM or on Disk
\item Built through parallel transformations
\item Automatically rebuilt on failure
\end{itemize}
\column{0.5\textwidth}
Operations
\begin{itemize}
\item Transformations (e.g. group, map, groupBy)
\item Actions (e.g. count, collect, save)
\end{itemize}
\end{columns}
\end{frame}
\begin{frame}
\frametitle{Transformations and actions}
\includegraphics[width=\textwidth]{images/sparktransformations.png}
\end{frame}
\begin{frame}
\frametitle{RDDs vs Datasets}
Datasets are the future
\begin{itemize}
\item More memory efficient
\item Libraries dropping support for RDDs
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{RDDs vs Datasets}
\includegraphics[width=\textwidth]{images/performance-wordcount-databricks.png}
\includegraphics[width=\textwidth]{images/Memory-Usage-when-Caching-Chart-databricks.png}
\end{frame}
\begin{frame}[fragile]
\frametitle{Language support}
\begin{lstlisting}[title=Scala]
val lines = sc.textFile(...)
lines.filter(x => x.contains("ERROR")).count()
\end{lstlisting}
\begin{lstlisting}[title=Python,language=Python]
lines = sc.textFile(...)
lines.filter(lambda s: "ERROR" in s).count()
\end{lstlisting}
\begin{lstlisting}[title=Java,language=Java]
Removed, to keep the slides clean :)
\end{lstlisting}
\center
sc is the Spark Context (more or this later)
\end{frame}
\begin{frame}
\frametitle{Different flavors}
\begin{center}
\begin{tabular}{ | l | c | c | c |}
\hline
Language & App & REPL & Performance \\ \hline
Scala & Yes & Yes & \huge{\color{green}\smiley} \\
Java & Yes & No & \huge{\color{green}\smiley} \\
Python & Yes & Yes & \huge{\color{yellow}\smiley}\\ \hline
\end{tabular}
\end{center}
The Read-eval-print-loop (REPL) is the easiest way to get started and explore datasets. It is just a special Spark application that accepts user input (scala code).
\end{frame}
\section{Working with RDDs}
\begin{frame}[fragile]
\frametitle{Creation}
\begin{lstlisting}[title=From normal data structures,numbers=none]
val nums = sc.parallelize(List(1, 2, 3))
val cont = sc.parallelize(List(("a", 1),
("a", 1),
("b", 3)))
\end{lstlisting}
\begin{lstlisting}[title=From distributed/local sources,numbers=none]
sc.textFile('myfile')
\end{lstlisting}
\vfill
\vfill
Note: sc is the spark context in the Spark interpreter
\end{frame}
% #################################
% Starts Collect
% #################################
\begin{frame}[fragile]
\frametitle{Operations: collect}
\only<1>{
\btVFill
{\huge\ttfamily collect()}
}
\begin{onlyenv}<2->
\btVFill
\begin{lstlisting}[numbers=none,basicstyle=\Large\ttfamily,title={Runs any pending transformation and returns the real values},linebackgroundcolor={\btLstHL{1,3}}]
nums.collect()
> List(1, 2, 3)
cont.collect()
> List((a, 1), (a, 1), (b, 3))
\end{lstlisting}
\end{onlyenv}
\btVFill
Reminder:
\begin{lstlisting}[numbers=none,linebackgroundcolor={\color{lightyellow}}]
nums: List(1, 2, 3)
cont: List(("a", 1), ("a", 1), ("b", 3))
\end{lstlisting}
\end{frame}
% #################################
% Ends Collect
% #################################
% #################################
% Starts Take
% #################################
\begin{frame}[fragile]
\frametitle{Operations: take}
\only<1>{
\btVFill
{\huge\ttfamily take(N)}
}
\begin{onlyenv}<2>
\btVFill
\begin{lstlisting}[title=Returns the N first elements,basicstyle=\Large\ttfamily,linebackgroundcolor={\btLstHL<2>{1,3}}]
nums.take(2)
> List(1, 2)
cont.take(1)
> List((a, 1))
\end{lstlisting}
\end{onlyenv}
\btVFill
Reminder:
\begin{lstlisting}[numbers=none,linebackgroundcolor={\color{lightyellow}}]
nums: List(1, 2, 3)
cont: List(("a", 1), ("a", 1), ("b", 3))
\end{lstlisting}
\end{frame}
% #################################
% Ends Take
% #################################
% #################################
% Starts Count
% #################################
\begin{frame}[fragile]
\frametitle{Operations: count}
\only<1>{
\btVFill
{\huge\ttfamily count()}
}
\begin{onlyenv}<2>
\btVFill
\begin{lstlisting}[title=Returns the number of elements in a collection,basicstyle=\Large\ttfamily,linebackgroundcolor={\btLstHL<2>{1,3}}]
nums.count()
> 3
cont.count()
> 3
\end{lstlisting}
\end{onlyenv}
\btVFill
Reminder:
\begin{lstlisting}[numbers=none,linebackgroundcolor={\color{lightyellow}}]
nums: List(1, 2, 3)
cont: List(("a", 1), ("a", 1), ("b", 3))
\end{lstlisting}
\end{frame}
% #################################
% Ends Count
% #################################
% #################################
% Starts filter
% #################################
\begin{frame}[fragile]
\frametitle{Operations: filter}
\btVFill
\only<1,2>{
{\huge\ttfamily filter(fn)}
\btVFill
}
\begin{onlyenv}<2>
This time, we need to define a function.
Filter applies that function to every element, and returns those where the function returns true.
For example:
\end{onlyenv}
\begin{onlyenv}<2,3>
\begin{lstlisting}[basicstyle=\ttfamily\Large]
val fn = (x:Int)) => x > 1
\end{lstlisting}
\end{onlyenv}
\begin{onlyenv}<3>
\btVFill
\begin{lstlisting}[title=Return a list containing the values where the function returns true,linebackgroundcolor={\btLstHL<3>{1}},basicstyle=\ttfamily\Large]
nums.filter(fn)
> List(2, 3)
\end{lstlisting}
\end{onlyenv}
\btVFill
Reminder:
\begin{lstlisting}[numbers=none,linebackgroundcolor={\color{lightyellow}}]
nums: List(1, 2, 3)
cont: List(("a", 1), ("a", 1), ("b", 3))
\end{lstlisting}
\end{frame}
% #################################
% Ends filter
% #################################
\begin{frame}[fragile]
\frametitle{Quick aside: anonymous functions and underscores}
% Defining a function when it is only going to be used once is tedious and makes reading code harder.
\only<1,2>{
{\Large In scala, we can define ``anonymous functions'', also known as lambda functions.}
}
\pause
\begin{lstlisting}[basicstyle=\ttfamily\Large]
val fn = (x:Int)) => x > 1
cont.filter(fn)
\end{lstlisting}
\only<2>{
{\Large is equivalent to:}
}
\begin{lstlisting}[basicstyle=\ttfamily\Large]
cont.filter((x:Int) => x>1)
\end{lstlisting}
\pause
\only<3>{
{\Large Additionally, the scala compiler is smart enough to infer types in this example. Hence, we could simply write:}
}
\begin{lstlisting}[basicstyle=\ttfamily\Large]
cont.filter(x => x>1)
\end{lstlisting}
\pause
{\Large Furthermore, we could use underscores to replace arguments:}
\begin{lstlisting}[basicstyle=\ttfamily\huge]
cont.filter(_>1)
\end{lstlisting}
\end{frame}
\begin{frame}[fragile]
\frametitle{Quick aside: anonymous functions and underscores}
{\large Every new argument in the lambda function represents a parameter
Hence, these two expresions are equivalent
}
\begin{lstlisting}[basicstyle=\ttfamily\huge]
_ + _
(x,y) => x+y
\end{lstlisting}
\end{frame}
\begin{frame}[fragile]
\frametitle{Operations: filter}
\Large Our last example could be written more concisely as:
\vfill
\begin{lstlisting}[linebackgroundcolor={\btLstHL<1>{1}},basicstyle=\ttfamily\Large]
nums.filter(_>1)
> List(2, 3)
\end{lstlisting}
\end{frame}
\begin{frame}[fragile]
\frametitle{Operations: filter}
\Large What would this filter do?
\vfill
\begin{lstlisting}[basicstyle=\ttfamily\Large,linebackgroundcolor={\btLstHL<1>{1}}]
nums.filter(_._1 == "a" && _._1 == 1)
> ???
\end{lstlisting}
\end{frame}
\begin{frame}[fragile]
\frametitle{Operations: filter}
\begin{lstlisting}[basicstyle=\ttfamily\normalsize,language={}]
<console>:9: error: missing parameter type
for expanded function ((x$1, x$2) =>
x$1._1.$eq$eq(a).$amp$amp(x$2._1.$eq$eq(1)))
Note: The expected type requires a
one-argument function accepting a 2-Tuple.
\end{lstlisting}
\end{frame}
{
\pagecolor{black}
\usebackgroundtemplate{\vbox to \paperheight{\vfil\hbox to \paperwidth{\hfil\includegraphics[height=\paperheight]{images/whatsparrow.jpg}\hfil}\vfil}}%
\begin{frame}[plain]
\end{frame}
}
\begin{frame}[fragile]
\frametitle{Operations: filter}
Remember, each new underscore represents a new argument. So that expression expands to:
\begin{lstlisting}[language=TeX,basicstyle=\ttfamily\Large]
nums.filter((x, y) => x._1 == "a" &&
y._2 == 1)
\end{lstlisting}
\end{frame}
\begin{frame}[fragile]
\frametitle{Operations: filter}
The right expression is:
\begin{lstlisting}[basicstyle=\ttfamily\Large,linebackgroundcolor={\btLstHL<1>{1,2}}]
nums.filter(x => x._1 == "a" &&
x._2 == 1)
> List((a, 1), (a, 1))
\end{lstlisting}
\end{frame}
% #################################
% Starts map
% #################################
\begin{frame}[fragile]
\frametitle{Operations: map}
\btVFill
\only<1>{
{\huge\ttfamily map(fn)}
\btVFill
}
\begin{onlyenv}<2>
\begin{lstlisting}[title=Apply a function to every item in the list,basicstyle=\Large\ttfamily,linebackgroundcolor={\btLstHL<2>{1,3}},basicstyle=\ttfamily\Large]
cont.map(x._2)
> List(1, 1, 3)
nums.map(_*3)
> List(3, 6, 9)
\end{lstlisting}
\end{onlyenv}
Reminder:
\begin{lstlisting}[numbers=none,linebackgroundcolor={\color{lightyellow}}]
nums: List(1, 2, 3)
cont: List(("a", 1), ("a", 1), ("b", 3))
\end{lstlisting}
\end{frame}
% #################################
% Ends map
% #################################
% #################################
% Starts reduce
% #################################
\begin{frame}[fragile]
\frametitle{Operations: reduce}
\btVFill
\only<1>{
{\huge\ttfamily reduce(fn)}
\btVFill
}
\begin{onlyenv}<2>
\btVFill
\begin{lstlisting}[title={Merge elements with an associative function (concisely)},basicstyle=\Large\ttfamily,linebackgroundcolor={\btLstHL<2>{1,3,4}}]
nums.reduce(_+_)
> 6
cont.reduce((x, y) => (x._1+y._1,
x._2*y._2)
> (aab, 3)
\end{lstlisting}
\end{onlyenv}
\btVFill
Reminder:
\begin{lstlisting}[numbers=none,linebackgroundcolor={\color{lightyellow}}]
nums: List(1, 2, 3)
cont: List(("a", 1), ("a", 1), ("b", 3))
\end{lstlisting}
\end{frame}
% #################################
% Ends reduce
% #################################
\begin{frame}[fragile]
\frametitle{Operations}
\begin{onlyenv}<1>
\btVFill
{\huge\ttfamily groupByKey()}
\btVFill
\end{onlyenv}
\btVFill
\begin{onlyenv}<2>
\begin{lstlisting}[title={Group elements of a list by the first item in the tuple}, numbers=none,basicstyle=\Large\ttfamily,linebackgroundcolor={\btLstHL<2>{1}}]
cont.groupByKey()
> [(b, [3]), (a, [1,1])]
\end{lstlisting}
\end{onlyenv}
\btVFill
Reminder:
\begin{lstlisting}[numbers=none,linebackgroundcolor={\color{lightyellow}}]
nums: List(1, 2, 3)
cont: List(("a", 1), ("a", 1), ("b", 3))
\end{lstlisting}
\end{frame}
\begin{frame}[fragile]
\frametitle{Operations}
\btVFill
\begin{onlyenv}<1>
\btVFill
{\huge \ttfamily reduceByKey(fn)}
\btVFill
\btVFill
\end{onlyenv}
\begin{onlyenv}<2>
\begin{lstlisting}[title={Group by key and reduce each value},basicstyle=\huge\ttfamily,basicstyle=\Large\ttfamily,linebackgroundcolor={\btLstHL<2>{1}}]
cont.reduceByKey((x,y)=>x+y)
> [(b,3), (a,2)]
\end{lstlisting}
reduceByKey is more efficient than applying group, map and reduce separately. The reduce function can be given to each worker, which avoids passing unnecessary data.
\footnote{\href{https://databricks.gitbooks.io/databricks-spark-knowledge-base/content/best_practices/prefer_reducebykey_over_groupbykey.html}{Databricks' post on avoiding groupByKey }}
\end{onlyenv}
\btVFill
Reminder:
\begin{lstlisting}[numbers=none,linebackgroundcolor={\color{lightyellow}}]
nums: List(1, 2, 3)
cont: List(("a", 1), ("a", 1), ("b", 3))
\end{lstlisting}
\end{frame}
\begin{frame}[fragile]
\frametitle{Operations}
And once you are done, save your results to a file.
\begin{lstlisting}[basicstyle=\Large\ttfamily]
nums.saveAsTextFile("hdfs://file.txt")
\end{lstlisting}
\end{frame}
\begin{frame}[fragile]
\frametitle{Example: Search in logs}
\begin{lstlisting}
val lines = sc.textFile("hdfs://...")
val errors = lines.filter(s =>
s.startswith("ERROR"))
val messages = errors.map(s => s.split("\t")._2)
messages.cache()
messages.filter(s => s.contains("mysq")).count()
messages.filter(s => s.contains("php").count()
\end{lstlisting}
\end{frame}
% \begin{frame}
% \frametitle{Dependencies}
% \includegraphics[width=\textwidth]{images/RDDdependencies.png}
% \end{frame}
% \begin{frame}[fragile]
% \frametitle{Working with Key-value pairs}
% \begin{lstlisting}{title=Key-value pairs in different languages}
% pair = (a, b)
% pair[0] # => a
% pair[1] # => b
% val pair = (a, b)
% pair._1 // => a
% pair._2 // => b
% Tuple2 pair = new Tuple2(a, b);
% pair._1 // => a
% pair._2 // => b
% \end{lstlisting}
% \end{frame}
\section{Using Spark}
\begin{frame}[fragile]
\frametitle{Local deployment using docker-compose}
\center
{ \Large
\only<1>{
Get the repo
}
\only<2>{
Move to the repo
}
\only<3>{
Run all the containers
}
\only<4>{
Launch spark-shell inside the master container
}
}
\vfill
\begin{lstlisting}[basicstyle=\ttfamily,linebackgroundcolor={
\btLstHL<1>{1}%
\btLstHL<2>{2}%
\btLstHL<3>{3}%
\btLstHL<4>{4}%
}]
git clone http://github.com/gettyimages/docker-spark
cd docker-spark
docker-compose up
docker exec -it dockerspark_master_1 bin/spark-shell
\end{lstlisting}
\end{frame}
\begin{frame}[fragile]
\frametitle{Demo}
\centering
\href{https://asciinema.org/a/92977}{\includegraphics[width=\textwidth]{gif/sparkrepl-thumb.png}}
\end{frame}
\begin{frame}[fragile]{Compose.yml Master}
\begin{lstlisting}[basicstyle=\ttfamily,language={},linebackgroundcolor={
\btLstHL<1>{2}%
\btLstHL<2>{3}%
\btLstHL<3>{11}%
\btLstHL<4>{12}%
}]
master:
image: gettyimages/spark
command: bin/spark-class org.apache.spark.deploy.master.Master -h master
hostname: master
environment:
MASTER: spark://master:7077
SPARK_CONF_DIR: /conf
SPARK_PUBLIC_DNS: localhost
... bunch of ports ...
volumes:
- ./conf/master:/conf
- ./data:/tmp/data
\end{lstlisting}
\end{frame}
\begin{frame}[fragile]{Compose.yml Worker}
\begin{lstlisting}[language={},basicstyle=\ttfamily,linebackgroundcolor={
\btLstHL<1>{2}%
\btLstHL<2>{3}%
\btLstHL<3>{12,13}%
\btLstHL<4>{6-8}%
}]
worker:
image: gettyimages/spark
command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://master:7077
hostname: worker
environment:
SPARK_CONF_DIR: /conf
SPARK_WORKER_CORES: 2
SPARK_WORKER_MEMORY: 1g
links:
- master
volumes:
- ./conf/worker:/conf
- ./data:/tmp/data
\end{lstlisting}
\end{frame}
\begin{frame}[fragile]
\frametitle{Useful info}
\begin{itemize}
\item ./data folder is mounted as /tmp/data
\begin{itemize}
\item Copy your datasets there
\item Load them in the shell: \texttt{sc.textFile("/tmp/data/...")}
\end{itemize}
\item Master Web UI (localhost:8080)
\item Worker Web UI (localhost:8081)
\item REPL Web UI (localhost:4040 when launched)
\end{itemize}
\textbf{A word of caution}: as any other app, the shell reserves resources on startup, whether you are using them or not.
\end{frame}
\begin{frame}[fragile]
\frametitle{Applications}
Steps:
\begin{itemize}
\item Write the code
\item Compile the jar
\item Make your data available to every node in the cluster
\item Submit it to your cluster
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Writing applications}
\begin{lstlisting}[title=Example application,basicstyle=\ttfamily]
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
object SparkWordCount {
def main(args: Array[String]) {
// create Spark context with Spark configuration
val sc = new SparkContext(new SparkConf().setAppName("Spark Example"))
... Your program ...
}
\end{lstlisting}
\end{frame}
\begin{frame}[fragile]
\frametitle{Running an aplication}
\begin{lstlisting}[language={},numbers=none]
./bin/spark-submit --class <main-class> \
--master <master-url> \
--deploy-mode <deploy-mode> \
--conf <key>=<value> \
... # other options
<application-jar> \
[application-arguments]
\end{lstlisting}
\end{frame}
\section{Full examples}
\begin{frame}[fragile]
\frametitle{Word frequency in wikipedia, revisited}
\begin{lstlisting}[title=Spark,basicstyle=\ttfamily,linebackgroundcolor={\color{orange!30}}]
val wiki = sc.textFile("Wikipedia.txt")
val counts = wiki.flatMap(line=> line.split(" ")
map(word => (word, 1)))
reduceByKey(_ + _)
\end{lstlisting}
\begin{lstlisting}[title=Pure scala,basicstyle=\ttfamily]
val wiki = scala.io.Source.fromFile("Wiki").getLines
wiki.flatMap(line=> line.split(" "))
map(x=>(x, 1)).toList
groupBy(x => x._1)
map({case (k, v) =>
(k, v.foldLeft(0)((a, b) => a+b._2)))
\end{lstlisting}
\end{frame}
\begin{frame}[plain]
\center
\huge{Shall we try it in the shell?}
\end{frame}
{
\pagecolor{black}
\usebackgroundtemplate{\vbox to \paperheight{\vfil\hbox to \paperwidth{\hfil\includegraphics[height=\paperheight]{images/justwaithere.jpg}\hfil}\vfil}}%
\begin{frame}[plain]
\end{frame}
}
\begin{frame}
\frametitle{Spark is not magic}
\begin{itemize}
\item We still have to add more resources
\item Caching may cause the spark version to use \textbf{more memory} (this can be configured)
\end{itemize}
\center
\pause
\huge{However, it allows us to scale our application}
\end{frame}
% \begin{frame}
% \frametitle{Word count}
% \includegraphics[width=\textwidth]{images/wordcount.png}
% \end{frame}
\begin{frame}[fragile]
\frametitle{Page rank}
\begin{columns}
\column{0.5\textwidth}
\includegraphics[width=\textwidth]{images/pagerank.png}
\column{0.5\textwidth}
\begin{itemize}
\item Created by Google
\item Rank given by links and their importance
\item Iterative (Perfect for Spark!)
\end{itemize}
\end{columns}
\begin{centering}
$
\text{PageRank of site} = \sum \frac{\text{Page rank of inbound link}}{\text{Number of links on that page}}
$
OR
$
PR(u) = (1-d)+d\times \sum \frac{PR(v)}{N(V)}
$
\end{centering}
\end{frame}
{
\pagecolor{white}
\usebackgroundtemplate{\vbox to \paperheight{\vfil\hbox to \paperwidth{\hfil\includegraphics[height=\paperheight]{images/pageranktoy.jpg}\hfil}\vfil}}%
\begin{frame}[plain]
% \includegraphics[width=\textwidth]{images/pageranktoy.jpg}
\footnote{\url{http://www.slideshare.net/sscdotopen/large-scale}}
\end{frame}
}
\begin{frame}
\frametitle{Page rank (code)}
\includegraphics[width=\textwidth]{images/example.png}
\end{frame}
\section*{Next week on SIBD}
\begin{frame}{Next week on SIBD}
\begin{itemize}
\item Advanced Spark configuration
\item Multiple hosts
\item Spark ecosystem
\item More examples in IBM BlueMix
\end{itemize}
\end{frame}
\section{Acknowledgements and useful links}
\begin{frame}
\begin{itemize}
\item \href{http://spark.apache.org/docs/latest/programming-guide.html}{Spark programming guide}
\item \href{https://databricks.com/blog/2016/01/04/introducing-apache-spark-datasets.html}{Databricks introducing apache spark datasets}
\item \href{https://www.safaribooksonline.com/library/view/data-analytics-with/9781491913734/ch04.html}{Data Analytics with Hadoop: In-Memory Computing with Spark}
\end{itemize}
\end{frame}
\end{document}