slides-intro-spark/spark1.tex

\documentclass{beamer}
\usepackage[utf8]{inputenc}
\usepackage{media9}
\usepackage{lipsum}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{amssymb}
\usetheme{metropolis}
\usepackage{tcolorbox}
\usepackage{listings}
\usepackage{lstlinebgcolor}
\usepackage{MnSymbol}
\usepackage{wasysym}
\usepackage{animate}

\lstset{%
  basicstyle=\ttfamily\large,
  columns=fullflexible,
  escapeinside={(*@}{@*)},
  numbers=none,
  breaklines=true,
  numbersep=5pt,                   % how far the line-numbers are from the code
  numberstyle=\tiny\color{gray}, % the style that is used for the line-numbers
  numbersep=5pt,                   % how far the line-numbers are from the code
  % postbreak={\hbox{\raisebox{0ex}[0ex][0ex]\color{red}{\hookrightarrow}\space}}
  postbreak=\raisebox{0ex}[0ex][0ex]{\ensuremath{\rcurvearrowse\space}},
  keywordstyle=\color{blue},       % keyword style
  stringstyle=\color{red},     % string literal style
  language=Scala,
  % belowskip=0pt,
  % aboveskip=0pt,
  }

\definecolor{lightyellow}{RGB}{255,255,204}

\title{Spark}
\subtitle{Cluster computing}
\author{J. Fernando Sánchez, Joaquín Salvachúa, Gabriel Huecas }
\institute{Universidad Politécnica de Madrid}
\date{2016}


\newcommand{\btVFill}{\vskip0pt plus 1filll}


\begin{document}

\begin{frame}
\titlepage{}
\end{frame}
\begin{frame}[allowframebreaks]
\frametitle{Outline}
\tableofcontents
\end{frame}

\section{Background}

\begin{frame}
  \frametitle{LISP and functional programming}
  \begin{columns}
\column{0.5\textwidth}
  \begin{itemize}
  \item Higher level programming
  \item Avoid side effects
  \item Pattern matching
  \end{itemize}
\column{0.5\textwidth}
  \includegraphics[width=\textwidth]{images/lisplogo.png}
\end{columns}
\end{frame}

\begin{frame}
  \frametitle{Scala}
  \begin{columns}
\column{0.5\textwidth}
  \includegraphics[width=\textwidth]{images/scalalogo.png}
\column{0.5\textwidth}
  \begin{itemize}
    \item A \textit{better} Java
    \item Functional programming (optional)
    \item Actors for (coarse) concurrency
  \end{itemize}
\end{columns}
\end{frame}

\begin{frame}
  \frametitle{Docker}
  \begin{columns}
\column{0.5\textwidth}
  \begin{itemize}
    \item Easy and repeatable deployment
    \item Lots of pre-built images @ hub.docker.com
    \item Building block for other tools (swarm, compose, machine...)
  \end{itemize}
\column{0.5\textwidth}
  \includegraphics[width=\textwidth]{images/docker_logo.png}
\end{columns}
\end{frame}

{
  \pagecolor{black}
  \usebackgroundtemplate{\vbox to \paperheight{\vfil\hbox to \paperwidth{\hfil\includegraphics[height=\paperheight]{images/knowscala.jpg}\hfil}\vfil}}%

\begin{frame}[plain]
\end{frame}
}
{
  \pagecolor{black}
  \usebackgroundtemplate{\vbox to \paperheight{\vfil\hbox to \paperwidth{\hfil\includegraphics[height=\paperheight]{images/knowbigdata.jpg}\hfil}\vfil}}%

\begin{frame}[plain]
\end{frame}
}
% \begin{frame}[plain]
%   \center
%   \huge{Show me}
% \end{frame}
{
  \pagecolor{white}
  \usebackgroundtemplate{\vbox to \paperheight{\vfil\hbox to \paperwidth{\hfil\includegraphics[width=\paperwidth]{images/showme.png}\hfil}\vfil}}%
\begin{frame}[plain]

\end{frame}
}

\begin{frame}[fragile]
\frametitle{Word count in Wikipedia}

Problem: find the frequency each word is used in Wikipedia.

We have the text of all wikipedia in a text file\footnote{By happy coincidence, the first two lines are ``hi world'' and ``hi''}. It begins like this:

\begin{lstlisting}[backgroundcolor=\color{lightyellow},language={},postbreak={},
  breakautoindent=false, breakindent=0pt, breaklines]
hi world
hi
Scala (SKAH-lah)[9] is a general-purpose programming language. Scala has full support for functional programming and a strong static type system. Designed to be concise,[10] many of Scala's design decisions were inspired by criticism of Java's shortcomings.[8]
\end{lstlisting}

\end{frame}

\begin{frame}[fragile]
\frametitle{Algorithm}

\begin{itemize}
  \item Read every line
  \item Chunk every line into words
  \item Count every occurrence
  \item For every word, sum its occurrences
\end{itemize}
\end{frame}

\begin{frame}[fragile]
\frametitle{Possible results of every step}
\begin{lstlisting}[language={},numbers=none,linebackgroundcolor={
      \btLstHL<1>{1}%
      \btLstHL<2>{2}%
      \btLstHL<3>{3,4}%
      \btLstHL<4>{5}%
    }]
(("hi world"), ("hi") ...)
List((hi, 1), (hi, 1), (world, 1) ...)
Map(hi ->(("hi", 1), (hi, 1)),
     world -> ((world, 1)) ...)
Map(hi-> 2, world -> 1 ...)
\end{lstlisting}
\end{frame}

\begin{frame}[fragile]
\frametitle{Running the scala shell}

We will use docker.

\begin{lstlisting}[language=bash,numbers=none]
docker run -it -v $PWD:Wikipedia.txt:Wiki \
            --rm williamyeh/scala
\end{lstlisting}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Scala code}

%[linebackgroundcolor={
      % \btLstHL<1>{1} %
      % \btLstHL<2>{3} %
      % \btLstHL<3>{5} %
      % \btLstHL<4>{7} %
      % \btLstHL<5>{9} %
      % }]
\begin{lstlisting}[language=Scala,linebackgroundcolor={
      \btLstHL<1>{2}%
      \btLstHL<2>{3}%
      \btLstHL<3>{4}%
      \btLstHL<4>{5}%
      \btLstHL<5>{6,7}%
    }]
import scala.io.Source
val wiki = Source.fromFile("Wiki").getLines
wiki.flatMap(line=> line.split(" "))
      map(x=>(x, 1)).toList
      groupBy(x => x._1)
      map({case (k, v) =>
            (k, v.foldLeft(0)((a, b) => a+b._2)))
  \end{lstlisting}

\end{frame}

\begin{frame}

  \center
  {\huge Let's run it in the shell.}

\end{frame}

{
  \pagecolor{black}
  \usebackgroundtemplate{\vbox to \paperheight{\vfil\hbox to \paperwidth{\hfil\includegraphics[height=\paperheight]{images/justwaithere.jpg}\hfil}\vfil}}%

\begin{frame}[plain]
\end{frame}
}

\begin{frame}
  \frametitle{Wikipedia is big}
  \includegraphics[width=\textwidth]{images/outofmemory.jpg}
\end{frame}

\begin{frame}

  \center
  {\huge What happened?}

\end{frame}

\begin{frame}
  \frametitle{Limited resources}
  \begin{itemize}
  \item CPU limits our speed
    \begin{itemize}
    \item Multi-cores help...
    \item ...but real parallelism is \textbf{hard}
    \end{itemize}
  \item RAM limits how much data you can process at the same time
    \begin{itemize}
    \item What if you need more than 128GB?
    \item You could use more than one computer...
    \item ... but cluster computing is even harder than ``local'' parallelism
    \end{itemize}
  \item Functional programming helps a bit
  \end{itemize}
\end{frame}
\begin{frame}[plain]
  \center
  \huge But... this was supposed to be fun, wasn't it?
\end{frame}

\section{Introduction to Spark}

\subsection{What is Spark?}
{
  \pagecolor{white}
  \usebackgroundtemplate{\vbox to \paperheight{\vfil\hbox to \paperwidth{\hfil\includegraphics[width=\paperwidth]{images/sparkweb.png}\hfil}\vfil}}%
\begin{frame}[plain]

\end{frame}
}
\begin{frame}
\frametitle{Quick definition}
{\center {\huge Apache Spark™ is a fast and general engine for large-scale data processing.\footnote{\url{http://spark.apache.org}}}}

On top of that:
 \begin{itemize}
   \item Open source (Top-level Apache project)
   \item Plays well with other tools
   \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Architecture}
  \includegraphics[width=\textwidth]{images/cluster-overview.png}
\end{frame}

\begin{frame}
  \frametitle{Programs}
  \center
  \includegraphics[height=.8\textheight]{images/sparkapp.png}
\end{frame}

\begin{frame}
  \frametitle{Ecosystem}
  \includegraphics[width=\textwidth]{images/sparkecosystem.png}
\end{frame}

\subsection{vs MapReduce}

{
  \pagecolor{white}
  \usebackgroundtemplate{\vbox to \paperheight{\vfil\hbox to \paperwidth{\hfil\includegraphics[width=\paperwidth]{images/hadoop-spark.png}\hfil}\vfil}}%

\begin{frame}[plain]
% \center
% \begin{tikzpicture}[remember picture,overlay]
% \node[at=(current page.center)] {
%   \includegraphics[height=\paperheight]{images/justwaithere.jpg}
% };
% \end{tikzpicture}
\end{frame}
}

\begin{frame}
\frametitle{Comparison to MapReduce}
\begin{itemize}
  \item In-memory data
    \begin{itemize}
      \item Less i/o overhead
      \item Faster operations
      \item Caching
    \end{itemize}
  \item Better for recursive tasks (e.g. machine learning)
  \item Some libraries are dropping MapReduce support
\end{itemize}
\end{frame}

  \begin{frame}
    \frametitle{Contributors to Spark/Hadoop 2014}
    \includegraphics[width=\textwidth]{images/hadoopsparkcontribs.png}
  \end{frame}

\begin{frame}
\frametitle{Project status}
\center

\includegraphics[width=\textwidth]{images/sparkgh.png}

\includegraphics[height=.35\textheight]{images/contributors.png}
\end{frame}

\subsection{Key concepts}

\begin{frame}
\frametitle{Overview}
  % {\huge Two core concepts}
  % \vfill
\begin{columns}[t]
    \column{0.5\textwidth}
    Data (RDDs/Datasets)

    \begin{itemize}
    \item RDD: Resilient Distributed Dataset
    \item Collections of objects spread across a cluster, stored in RAM or on Disk
    \item Built through parallel transformations
    \item Automatically rebuilt on failure
    \end{itemize}
    \column{0.5\textwidth}
    Operations
    \begin{itemize}
    \item Transformations (e.g. group, map, groupBy)
    \item Actions (e.g. count, collect, save)
    \end{itemize}
\end{columns}
\end{frame}

\begin{frame}
\frametitle{Transformations and actions}
\includegraphics[width=\textwidth]{images/sparktransformations.png}
\end{frame}


\begin{frame}
  \frametitle{RDDs vs Datasets}
    Datasets are the future
    \begin{itemize}
      \item More memory efficient
      \item Libraries dropping support for RDDs
    \end{itemize}

\end{frame}
\begin{frame}
  \frametitle{RDDs vs Datasets}

    \includegraphics[width=\textwidth]{images/performance-wordcount-databricks.png}

    \includegraphics[width=\textwidth]{images/Memory-Usage-when-Caching-Chart-databricks.png}
  \end{frame}

\begin{frame}[fragile]
  \frametitle{Language support}
\vspace{-2em}
    \begin{lstlisting}[title=Scala]
val lines = sc.textFile(...)
lines.filter(x => x.contains("ERROR")).count()
\end{lstlisting}
    \begin{lstlisting}[title=Python,language=Python]
lines = sc.textFile(...)
lines.filter(lambda s: "ERROR" in s).count()
\end{lstlisting}
    \begin{lstlisting}[title=Java,language=Java]
      Removed, to keep the slides clean :)
\end{lstlisting}

\center
sc is the Spark Context (more or this later)
\end{frame}

\begin{frame}
  \frametitle{Different flavors}
  \begin{center}
    \begin{tabular}{ | l | c | c | c |}
      \hline
      Language & App & REPL & Performance \\ \hline
      Scala & Yes & Yes & \huge{\color{green}\smiley} \\
      Java & Yes & No & \huge{\color{green}\smiley} \\
      Python & Yes & Yes & \huge{\color{yellow}\smiley}\\ \hline
    \end{tabular}
  \end{center}
The Read-eval-print-loop (REPL) is the easiest way to get started and explore datasets. It is just a special Spark application that accepts user input (scala code).

\end{frame}

\section{Working with RDDs}

\begin{frame}[fragile]
\frametitle{Creation}

\begin{lstlisting}[title=From normal data structures,numbers=none]
val nums = sc.parallelize(List(1, 2, 3))
val cont = sc.parallelize(List(("a", 1),
                                   ("a", 1),
                                   ("b", 3)))
\end{lstlisting}

\begin{lstlisting}[title=From distributed/local sources,numbers=none]
sc.textFile('myfile')
\end{lstlisting}

\vfill
\vfill
Note: sc is the spark context in the Spark interpreter

\end{frame}

%  #################################
%  Starts Collect
%  #################################

\begin{frame}[fragile]
\frametitle{Operations: collect}
\only<1>{
  \btVFill
{\huge\ttfamily collect()}
}

\begin{onlyenv}<2->

\btVFill

\begin{lstlisting}[numbers=none,basicstyle=\Large\ttfamily,title={Runs any pending transformation and returns the real values},linebackgroundcolor={\btLstHL{1,3}}]
nums.collect()
> List(1, 2, 3)
cont.collect()
> List((a, 1), (a, 1), (b, 3))
\end{lstlisting}

\end{onlyenv}

\btVFill

Reminder:
\begin{lstlisting}[numbers=none,linebackgroundcolor={\color{lightyellow}}]
nums: List(1, 2, 3)
cont: List(("a", 1), ("a", 1), ("b", 3))
\end{lstlisting}

\end{frame}

%  #################################
%  Ends Collect
%  #################################

%  #################################
%  Starts Take
%  #################################

\begin{frame}[fragile]
\frametitle{Operations: take}
\only<1>{
  \btVFill
{\huge\ttfamily take(N)}
}

\begin{onlyenv}<2>

\btVFill

\begin{lstlisting}[title=Returns the N first elements,basicstyle=\Large\ttfamily,linebackgroundcolor={\btLstHL<2>{1,3}}]
nums.take(2)
>  List(1, 2)
cont.take(1)
>  List((a, 1))
\end{lstlisting}

\end{onlyenv}

\btVFill

Reminder:
\begin{lstlisting}[numbers=none,linebackgroundcolor={\color{lightyellow}}]
nums: List(1, 2, 3)
cont: List(("a", 1), ("a", 1), ("b", 3))
\end{lstlisting}

\end{frame}

%  #################################
%  Ends Take
%  #################################

%  #################################
%  Starts Count
%  #################################

\begin{frame}[fragile]
\frametitle{Operations: count}
\only<1>{
  \btVFill
{\huge\ttfamily count()}
}

\begin{onlyenv}<2>

\btVFill

\begin{lstlisting}[title=Returns the number of elements in a collection,basicstyle=\Large\ttfamily,linebackgroundcolor={\btLstHL<2>{1,3}}]
nums.count()
>  3
cont.count()
>  3
\end{lstlisting}

\end{onlyenv}

\btVFill

Reminder:
\begin{lstlisting}[numbers=none,linebackgroundcolor={\color{lightyellow}}]
nums: List(1, 2, 3)
cont: List(("a", 1), ("a", 1), ("b", 3))
\end{lstlisting}

\end{frame}

%  #################################
%  Ends Count
%  #################################
%  #################################
%  Starts filter
%  #################################

\begin{frame}[fragile]
\frametitle{Operations: filter}
  \btVFill
\only<1,2>{
{\huge\ttfamily filter(fn)}
  \btVFill
}

\begin{onlyenv}<2>
This time, we need to define a function.

Filter applies that function to every element, and returns those where the function returns true.

For example:

\end{onlyenv}
\begin{onlyenv}<2,3>
\begin{lstlisting}[basicstyle=\ttfamily\Large]
val fn = (x:Int)) => x > 1
\end{lstlisting}
\end{onlyenv}
\begin{onlyenv}<3>

\btVFill

\begin{lstlisting}[title=Return a list containing the values where the function returns true,linebackgroundcolor={\btLstHL<3>{1}},basicstyle=\ttfamily\Large]
nums.filter(fn)
>  List(2, 3)
\end{lstlisting}
\end{onlyenv}

\btVFill

Reminder:
\begin{lstlisting}[numbers=none,linebackgroundcolor={\color{lightyellow}}]
nums: List(1, 2, 3)
cont: List(("a", 1), ("a", 1), ("b", 3))
\end{lstlisting}

\end{frame}

%  #################################
%  Ends filter
%  #################################

\begin{frame}[fragile]
  \frametitle{Quick aside: anonymous functions and underscores}

% Defining a function when it is only going to be used once is tedious and makes reading code harder.

\only<1,2>{
{\Large In scala, we can define ``anonymous functions'', also known as lambda functions.}
}

\pause

\vspace{-2em} % Avoid overlapping with footer
\begin{lstlisting}[basicstyle=\ttfamily\Large]
val fn = (x:Int)) => x > 1
cont.filter(fn)
\end{lstlisting}

\only<2>{
{\Large is equivalent to:}
}

\begin{lstlisting}[basicstyle=\ttfamily\Large]
cont.filter((x:Int) => x>1)
\end{lstlisting}

\pause

\only<3>{
{\Large Additionally, the scala compiler is smart enough to infer types in this example. Hence, we could simply write:}
}

\begin{lstlisting}[basicstyle=\ttfamily\Large]
cont.filter(x => x>1)
\end{lstlisting}

\pause

{\Large Furthermore, we could use underscores to replace arguments:}

\vspace{-2em}

\begin{lstlisting}[basicstyle=\ttfamily\huge]
cont.filter(_>1)
\end{lstlisting}

\end{frame}

\begin{frame}[fragile]
  \frametitle{Quick aside: anonymous functions and underscores}

{\large Every new argument in the lambda function represents a parameter

Hence, these two expresions are equivalent
}

\begin{lstlisting}[basicstyle=\ttfamily\huge]
_ + _
(x,y) => x+y
\end{lstlisting}


\end{frame}

\begin{frame}[fragile]
  \frametitle{Operations: filter}

\Large Our last example could be written more concisely as:

\vfill

\begin{lstlisting}[linebackgroundcolor={\btLstHL<1>{1}},basicstyle=\ttfamily\Large]
nums.filter(_>1)
>  List(2, 3)
\end{lstlisting}

\end{frame}

\begin{frame}[fragile]
  \frametitle{Operations: filter}

\Large What would this filter do?

\vfill

\begin{lstlisting}[basicstyle=\ttfamily\Large,linebackgroundcolor={\btLstHL<1>{1}}]
cont.filter(_._1 == "a" && _._1 == 1)
>  ???
\end{lstlisting}

\end{frame}

\begin{frame}[fragile]
  \frametitle{Operations: filter}


\begin{lstlisting}[basicstyle=\ttfamily\normalsize,language={}]
<console>:9: error: missing parameter type
for expanded function ((x$1, x$2) =>
x$1._1.$eq$eq(a).$amp$amp(x$2._1.$eq$eq(1)))
Note: The expected type requires a
one-argument function accepting a 2-Tuple.
\end{lstlisting}
\end{frame}

{
  \pagecolor{black}
  \usebackgroundtemplate{\vbox to \paperheight{\vfil\hbox to \paperwidth{\hfil\includegraphics[height=\paperheight]{images/whatsparrow.jpg}\hfil}\vfil}}%

\begin{frame}[plain]
\end{frame}
}

\begin{frame}[fragile]
  \frametitle{Operations: filter}

Remember, each new underscore represents a new argument. So that expression expands to:

\begin{lstlisting}[language=TeX,basicstyle=\ttfamily\Large]
cont.filter((x, y) => x._1 == "a" &&
                   y._2 == 1)
\end{lstlisting}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Operations: filter}

The right expression is:

\begin{lstlisting}[basicstyle=\ttfamily\Large,linebackgroundcolor={\btLstHL<1>{1,2}}]
cont.filter(x => x._1 == "a" &&
                   x._2 == 1)
> List((a, 1), (a, 1))
\end{lstlisting}
\end{frame}


%  #################################
%  Starts map
%  #################################

\begin{frame}[fragile]
\frametitle{Operations: map}
  \btVFill
\only<1>{
{\huge\ttfamily map(fn)}
  \btVFill
}

\begin{onlyenv}<2>
\begin{lstlisting}[title=Apply a function to every item in the list,basicstyle=\Large\ttfamily,linebackgroundcolor={\btLstHL<2>{1,3}},basicstyle=\ttfamily\Large]
cont.map(x._2)
>  List(1, 1, 3)
nums.map(_*3)
>  List(3, 6, 9)
\end{lstlisting}
\end{onlyenv}

Reminder:
\begin{lstlisting}[numbers=none,linebackgroundcolor={\color{lightyellow}}]
nums: List(1, 2, 3)
cont: List(("a", 1), ("a", 1), ("b", 3))
\end{lstlisting}

\end{frame}


%  #################################
%  Ends map
%  #################################
%  #################################
%  Starts reduce
%  #################################

\begin{frame}[fragile]
\frametitle{Operations: reduce}
  \btVFill
\only<1>{
{\huge\ttfamily reduce(fn)}
  \btVFill
}

\begin{onlyenv}<2>

\btVFill

\begin{lstlisting}[title={Merge elements with an associative function (concisely)},basicstyle=\Large\ttfamily,linebackgroundcolor={\btLstHL<2>{1,3,4}}]
nums.reduce(_+_)
>  6
cont.reduce((x, y) => (x._1+y._1,
                          x._2*y._2)
>  (aab, 3)
\end{lstlisting}
\end{onlyenv}

\btVFill

Reminder:
\begin{lstlisting}[numbers=none,linebackgroundcolor={\color{lightyellow}}]
nums: List(1, 2, 3)
cont: List(("a", 1), ("a", 1), ("b", 3))
\end{lstlisting}

\end{frame}

%  #################################
%  Ends reduce
%  #################################

\begin{frame}[fragile]
\frametitle{Operations}
\begin{onlyenv}<1>
  \btVFill
{\huge\ttfamily groupByKey()}
  \btVFill
\end{onlyenv}

\btVFill
\begin{onlyenv}<2>
  \begin{lstlisting}[title={Group elements of a list by the first item in the tuple}, numbers=none,basicstyle=\Large\ttfamily,linebackgroundcolor={\btLstHL<2>{1}}]
cont.groupByKey()
>  [(b, [3]), (a, [1,1])]
    \end{lstlisting}
\end{onlyenv}

\btVFill

Reminder:
\begin{lstlisting}[numbers=none,linebackgroundcolor={\color{lightyellow}}]
nums: List(1, 2, 3)
cont: List(("a", 1), ("a", 1), ("b", 3))
\end{lstlisting}

\end{frame}
\begin{frame}[fragile]
\frametitle{Operations}
\btVFill
\begin{onlyenv}<1>
  \btVFill
{\huge \ttfamily reduceByKey(fn)}
  \btVFill
\btVFill
\end{onlyenv}
\begin{onlyenv}<2>
  \begin{lstlisting}[title={Group by key and reduce each value},basicstyle=\huge\ttfamily,basicstyle=\Large\ttfamily,linebackgroundcolor={\btLstHL<2>{1}}]
cont.reduceByKey((x,y)=>x+y)
>  [(b,3), (a,2)]
\end{lstlisting}

reduceByKey is more efficient than applying group, map and reduce separately. The reduce function can be given to each worker, which avoids passing unnecessary data.
\footnote{\href{https://databricks.gitbooks.io/databricks-spark-knowledge-base/content/best_practices/prefer_reducebykey_over_groupbykey.html}{Databricks' post on avoiding groupByKey }}
\end{onlyenv}

\btVFill

Reminder:
\begin{lstlisting}[numbers=none,linebackgroundcolor={\color{lightyellow}}]
nums: List(1, 2, 3)
cont: List(("a", 1), ("a", 1), ("b", 3))
\end{lstlisting}

\end{frame}
\begin{frame}[fragile]
\frametitle{Operations}

And once you are done, save your results to a file.

  \begin{lstlisting}[basicstyle=\Large\ttfamily]
nums.saveAsTextFile("hdfs://file.txt")
  \end{lstlisting}

\end{frame}

\begin{frame}[fragile]
\frametitle{Example: Search in logs}
\begin{lstlisting}
val lines = sc.textFile("hdfs://...")
val errors = lines.filter(s =>
                        s.startswith("ERROR"))
val messages = errors.map(s => s.split("\t")._2)
messages.cache()
messages.filter(s => s.contains("mysq")).count()
messages.filter(s => s.contains("php").count()
\end{lstlisting}
\end{frame}

% \begin{frame}
% \frametitle{Dependencies}
% \includegraphics[width=\textwidth]{images/RDDdependencies.png}
% \end{frame}

% \begin{frame}[fragile]
% \frametitle{Working with Key-value pairs}
% \begin{lstlisting}{title=Key-value pairs in different languages}
% pair = (a, b)
%     pair[0] # => a
%     pair[1] # => b
% val pair = (a, b)
%     pair._1 // => a
%     pair._2 // => b
% Tuple2 pair = new Tuple2(a, b);
%     pair._1 // => a
%     pair._2 // => b
% \end{lstlisting}
% \end{frame}


\section{Using Spark}

\begin{frame}[fragile]
  \frametitle{Local deployment using docker-compose}

  \center
{  \Large
\only<1>{
  Get the repo
}

\only<2>{
  Move to the repo
}
\only<3>{
  Run all the containers
}
\only<4>{
  Launch spark-shell inside the master container
}
}
\vfill

  \begin{lstlisting}[basicstyle=\ttfamily,linebackgroundcolor={
      \btLstHL<1>{1}%
      \btLstHL<2>{2}%
      \btLstHL<3>{3}%
      \btLstHL<4>{4}%
    }]
git clone http://github.com/gettyimages/docker-spark
cd docker-spark
docker-compose up
docker exec -it dockerspark_master_1 bin/spark-shell
\end{lstlisting}

\end{frame}


\begin{frame}[fragile]
  \frametitle{Demo}
  \centering
\href{https://asciinema.org/a/92977}{\includegraphics[width=\textwidth]{gif/sparkrepl-thumb.png}}

\end{frame}
\begin{frame}[fragile]{Compose.yml Master}
\vspace{-2em} % Avoid overlapping with footer
\begin{lstlisting}[basicstyle=\ttfamily,language={},linebackgroundcolor={
    \btLstHL<1>{2}%
    \btLstHL<2>{3}%
    \btLstHL<3>{11}%
    \btLstHL<4>{12}%
    }]
master:
  image: gettyimages/spark
  command: bin/spark-class org.apache.spark.deploy.master.Master -h master
  hostname: master
  environment:
    MASTER: spark://master:7077
    SPARK_CONF_DIR: /conf
    SPARK_PUBLIC_DNS: localhost
    ... bunch of ports ...
  volumes:
    - ./conf/master:/conf
    - ./data:/tmp/data
  \end{lstlisting}
\end{frame}
\begin{frame}[fragile]{Compose.yml Worker}
\vspace{-2em} % Avoid overlapping with footer
\begin{lstlisting}[language={},basicstyle=\ttfamily,linebackgroundcolor={
    \btLstHL<1>{2}%
    \btLstHL<2>{3}%
    \btLstHL<3>{12,13}%
    \btLstHL<4>{6-8}%
    }]
worker:
  image: gettyimages/spark
  command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://master:7077
  hostname: worker
  environment:
    SPARK_CONF_DIR: /conf
    SPARK_WORKER_CORES: 2
    SPARK_WORKER_MEMORY: 1g
  links:
    - master
  volumes:
    - ./conf/worker:/conf
    - ./data:/tmp/data
    \end{lstlisting}
\end{frame}
\begin{frame}[fragile]
  \frametitle{Useful info}
  \begin{itemize}
  \item ./data folder is mounted as /tmp/data
    \begin{itemize}
    \item Copy your datasets there
    \item Load them in the shell: \texttt{sc.textFile("/tmp/data/...")}
    \end{itemize}
  \item Master Web UI (localhost:8080)
  \item Worker Web UI (localhost:8081)
  \item REPL Web UI (localhost:4040 when launched)
    \end{itemize}
\textbf{A word of caution}: as any other app, the shell reserves resources on startup, whether you are using them or not.
\end{frame}

\begin{frame}[fragile]
  \frametitle{Applications}
  Steps:
  \begin{itemize}
    \item Write the code
    \item Compile the jar
    \item Make your data available to every node in the cluster
    \item Submit it to your cluster
  \end{itemize}
\end{frame}
\begin{frame}[fragile]
  \frametitle{Writing applications}
    \begin{lstlisting}[title=Example application,basicstyle=\ttfamily]
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf

object SparkWordCount {
  def main(args: Array[String]) {
    // create Spark context with Spark configuration
    val sc = new SparkContext(new SparkConf().setAppName("Spark Example"))
  ... Your program ...
}
\end{lstlisting}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Running an aplication}
    \begin{lstlisting}[language={},numbers=none]
./bin/spark-submit --class <main-class> \
                     --master <master-url> \
                     --deploy-mode <deploy-mode> \
                     --conf <key>=<value> \
                     ... # other options
                     <application-jar> \
                     [application-arguments]
\end{lstlisting}
\end{frame}

\section{Full examples}

\begin{frame}[fragile]
  \frametitle{Word frequency in wikipedia, revisited}
  \begin{lstlisting}[title=Spark,basicstyle=\ttfamily,linebackgroundcolor={\color{orange!30}}]
val wiki = sc.textFile("Wikipedia.txt")
val counts = wiki.flatMap(line=> line.split(" ")
                map(word => (word, 1)))
                reduceByKey(_ + _)
\end{lstlisting}

\begin{lstlisting}[title=Pure scala,basicstyle=\ttfamily]
val wiki = scala.io.Source.fromFile("Wiki").getLines
wiki.flatMap(line=> line.split(" "))
      map(x=>(x, 1)).toList
      groupBy(x => x._1)
      map({case (k, v) =>
            (k, v.foldLeft(0)((a, b) => a+b._2)))
  \end{lstlisting}
\end{frame}

\begin{frame}[plain]
  \center
  \huge{Shall we try it in the shell?}
\end{frame}

{
  \pagecolor{black}
  \usebackgroundtemplate{\vbox to \paperheight{\vfil\hbox to \paperwidth{\hfil\includegraphics[height=\paperheight]{images/justwaithere.jpg}\hfil}\vfil}}%

\begin{frame}[plain]
\end{frame}
}

\begin{frame}
  \frametitle{Spark is not magic}
  \begin{itemize}
    \item We still have to add more resources
    \item Caching may cause the spark version to use \textbf{more memory} (this can be configured)
  \end{itemize}
  \center
  \pause
  \huge{However, it allows us to scale our application}

\end{frame}


% \begin{frame}
%   \frametitle{Word count}
% \includegraphics[width=\textwidth]{images/wordcount.png}
% \end{frame}

\begin{frame}[fragile]
  \frametitle{Page rank}

  \begin{columns}
    \column{0.5\textwidth}
    \includegraphics[width=\textwidth]{images/pagerank.png}
    \column{0.5\textwidth}
    \begin{itemize}
    \item Created by Google
    \item Rank given by links and their importance
    \item Iterative (Perfect for Spark!)
    \end{itemize}
  \end{columns}
  \begin{centering}

$
 \text{PageRank of site} = \sum \frac{\text{Page rank of inbound link}}{\text{Number of links on that page}}
$

OR

$
 PR(u) = (1-d)+d\times \sum \frac{PR(v)}{N(V)}
$

\end{centering}
\end{frame}

{
  \pagecolor{white}
  \usebackgroundtemplate{\vbox to \paperheight{\vfil\hbox to \paperwidth{\hfil\includegraphics[height=\paperheight]{images/pageranktoy.jpg}\hfil}\vfil}}%

\begin{frame}[plain]
% \includegraphics[width=\textwidth]{images/pageranktoy.jpg}
\footnote{\url{http://www.slideshare.net/sscdotopen/large-scale}}

\end{frame}
}

\begin{frame}
\frametitle{Page rank (code)}
\includegraphics[width=\textwidth]{images/example.png}
\end{frame}

\section*{Next week}

\begin{frame}{Next week}
  \begin{itemize}
  \item Advanced Spark configuration
  \item Multiple hosts
  \item Spark ecosystem
  \item More examples in IBM BlueMix
  \end{itemize}
\end{frame}

\section{Acknowledgements and useful links}

\begin{frame}
  \begin{itemize}
\item \href{http://spark.apache.org/docs/latest/programming-guide.html}{Spark programming guide}
\item \href{https://databricks.com/blog/2016/01/04/introducing-apache-spark-datasets.html}{Databricks introducing apache spark datasets}
\item \href{https://www.safaribooksonline.com/library/view/data-analytics-with/9781491913734/ch04.html}{Data Analytics with Hadoop: In-Memory Computing with Spark}
  \end{itemize}
\end{frame}

\end{document}