\documentclass { beamer}
\usepackage [utf8] { inputenc}
\usepackage { media9}
\usepackage { lipsum}
\usepackage { amsmath}
\usepackage { amsfonts}
\usepackage { amssymb}
\usetheme { metropolis}
\usepackage { tcolorbox}
\usepackage { listings}
\usepackage { lstlinebgcolor}
\usepackage { MnSymbol}
\usepackage { wasysym}
\usepackage { animate}
\lstset{%
basicstyle=\ttfamily\large,
columns=fullflexible,
escapeinside={(*@}{@*)},
numbers=none,
breaklines=true,
numbersep=5pt, % how far the line-numbers are from the code
numberstyle=\tiny\color{gray}, % the style that is used for the line-numbers
% postbreak={\hbox{\raisebox{0ex}[0ex][0ex]\color{red}{\hookrightarrow}\space}}
postbreak=\raisebox{0ex}[0ex][0ex]{\ensuremath{\rcurvearrowse\space}},
keywordstyle=\color{blue}, % keyword style
stringstyle=\color{red}, % string literal style
language=Scala,
% belowskip=0pt,
% aboveskip=0pt,
}
\definecolor { lightyellow} { RGB} { 255,255,204}
\title { Spark}
\subtitle { Cluster computing}
\author { J. Fernando Sánchez, Joaquín Salvachúa, Gabriel Huecas }
\institute { Universidad Politécnica de Madrid}
\date { 2016}
\newcommand { \btVFill } { \vskip 0pt plus 1filll}
\begin { document}
\begin { frame}
\titlepage { }
\end { frame}
\begin { frame} [allowframebreaks]
\frametitle { Outline}
\tableofcontents
\end { frame}
\section { Background}
\begin { frame}
\frametitle { LISP and functional programming}
\begin { columns}
\column { 0.5\textwidth }
\begin { itemize}
\item Higher level programming
\item Avoid side effects
\item Pattern matching
\end { itemize}
\column { 0.5\textwidth }
\includegraphics [width=\textwidth] { images/lisplogo.png}
\end { columns}
\end { frame}
\begin { frame}
\frametitle { Scala}
\begin { columns}
\column { 0.5\textwidth }
\includegraphics [width=\textwidth] { images/scalalogo.png}
\column { 0.5\textwidth }
\begin { itemize}
\item A \textit { better} Java
\item Functional programming (optional)
\item Actors for (coarse) concurrency
\end { itemize}
\end { columns}
\end { frame}
\begin { frame}
\frametitle { Docker}
\begin { columns}
\column { 0.5\textwidth }
\begin { itemize}
\item Easy and repeatable deployment
\item Lots of pre-built images @ hub.docker.com
\item Building block for other tools (swarm, compose, machine...)
\end { itemize}
\column { 0.5\textwidth }
\includegraphics[width=\textwidth]{images/docker_logo.png}
\end { columns}
\end { frame}
{
\pagecolor { black}
\usebackgroundtemplate { \vbox to \paperheight { \vfil \hbox to \paperwidth { \hfil \includegraphics [height=\paperheight] { images/knowscala.jpg} \hfil } \vfil } } %
\begin { frame} [plain]
\end { frame}
}
{
\pagecolor { black}
\usebackgroundtemplate { \vbox to \paperheight { \vfil \hbox to \paperwidth { \hfil \includegraphics [height=\paperheight] { images/knowbigdata.jpg} \hfil } \vfil } } %
\begin { frame} [plain]
\end { frame}
}
% \begin{frame}[plain]
% \center
% \huge{Show me}
% \end{frame}
{
\pagecolor { white}
\usebackgroundtemplate { \vbox to \paperheight { \vfil \hbox to \paperwidth { \hfil \includegraphics [width=\paperwidth] { images/showme.png} \hfil } \vfil } } %
\begin { frame} [plain]
\end { frame}
}
\begin { frame} [fragile]
\frametitle { Word count in Wikipedia}
Problem: find the frequency each word is used in Wikipedia.
We have the text of all wikipedia in a text file\footnote { By happy coincidence, the first two lines are ``hi world'' and ``hi''} . It begins like this:
\begin { lstlisting} [backgroundcolor=\color { lightyellow} ,language={ } ,postbreak={ } ,
breakautoindent=false, breakindent=0pt, breaklines]
hi world
hi
Scala (SKAH-lah)[9] is a general-purpose programming language. Scala has full support for functional programming and a strong static type system. Designed to be concise,[10] many of Scala's design decisions were inspired by criticism of Java's shortcomings.[8]
\end { lstlisting}
\end { frame}
\begin { frame} [fragile]
\frametitle { Algorithm}
\begin { itemize}
\item Read every line
\item Chunk every line into words
\item Count every occurrence
\item For every word, sum its occurrences
\end { itemize}
\end { frame}
\begin { frame} [fragile]
\frametitle { Possible results of every step}
\begin { lstlisting} [language={ } ,numbers=none,linebackgroundcolor={
\btLstHL <1>{ 1} %
\btLstHL <2>{ 2} %
\btLstHL <3>{ 3,4} %
\btLstHL <4>{ 5} %
} ]
(("hi world"), ("hi") ...)
List((hi, 1), (hi, 1), (world, 1) ...)
Map(hi ->(("hi", 1), (hi, 1)),
world -> ((world, 1)) ...)
Map(hi-> 2, world -> 1 ...)
\end { lstlisting}
\end { frame}
\begin { frame} [fragile]
\frametitle { Running the scala shell}
We will use docker.
\begin { lstlisting} [language=bash,numbers=none]
docker run -it -v $PWD/Wikipedia.txt:/Wiki \
    --rm williamyeh/scala
\end { lstlisting}
\end { frame}
\begin { frame} [fragile]
\frametitle { Scala code}
%[linebackgroundcolor={
% \btLstHL<1>{1} %
% \btLstHL<2>{3} %
% \btLstHL<3>{5} %
% \btLstHL<4>{7} %
% \btLstHL<5>{9} %
% }]
\begin { lstlisting} [language=Scala,linebackgroundcolor={
\btLstHL <1>{ 2} %
\btLstHL <2>{ 3} %
\btLstHL <3>{ 4} %
\btLstHL <4>{ 5} %
\btLstHL <5>{ 6,7} %
} ]
import scala.io.Source
val wiki = Source.fromFile("Wiki").getLines
wiki.flatMap(line=> line.split(" "))
  .map(x=>(x, 1)).toList
  .groupBy(x => x._1)
  .map({ case (k, v) =>
    (k, v.foldLeft(0)((a, b) => a+b._2))})
\end { lstlisting}
\end { frame}
\begin { frame}
\center
{ \huge Let's run it in the shell.}
\end { frame}
{
\pagecolor { black}
\usebackgroundtemplate { \vbox to \paperheight { \vfil \hbox to \paperwidth { \hfil \includegraphics [height=\paperheight] { images/justwaithere.jpg} \hfil } \vfil } } %
\begin { frame} [plain]
\end { frame}
}
\begin { frame}
\frametitle { Wikipedia is big}
\includegraphics [width=\textwidth] { images/outofmemory.jpg}
\end { frame}
\begin { frame}
\center
{ \huge What happened?}
\end { frame}
\begin { frame}
\frametitle { Limited resources}
\begin { itemize}
\item CPU limits our speed
\begin { itemize}
\item Multi-cores help...
\item ...but real parallelism is \textbf { hard}
\end { itemize}
\item RAM limits how much data you can process at the same time
\begin { itemize}
\item What if you need more than 128GB?
\item You could use more than one computer...
\item ... but cluster computing is even harder than ``local'' parallelism
\end { itemize}
\item Functional programming helps a bit
\end { itemize}
\end { frame}
\begin { frame} [plain]
\center
\huge But... this was supposed to be fun, wasn't it?
\end { frame}
\section { Introduction to Spark}
\subsection { What is Spark?}
{
\pagecolor { white}
\usebackgroundtemplate { \vbox to \paperheight { \vfil \hbox to \paperwidth { \hfil \includegraphics [width=\paperwidth] { images/sparkweb.png} \hfil } \vfil } } %
\begin { frame} [plain]
\end { frame}
}
\begin { frame}
\frametitle { Quick definition}
{ \center { \huge Apache Spark™ is a fast and general engine for large-scale data processing.\footnote { \url { http://spark.apache.org} } } }
On top of that:
\begin { itemize}
\item Open source (Top-level Apache project)
\item Plays well with other tools
\end { itemize}
\end { frame}
\begin { frame}
\frametitle { Architecture}
\includegraphics [width=\textwidth] { images/cluster-overview.png}
\end { frame}
\begin { frame}
\frametitle { Programs}
\center
\includegraphics [height=.8\textheight] { images/sparkapp.png}
\end { frame}
\begin { frame}
\frametitle { Ecosystem}
\includegraphics [width=\textwidth] { images/sparkecosystem.png}
\end { frame}
\subsection { vs MapReduce}
{
\pagecolor { white}
\usebackgroundtemplate { \vbox to \paperheight { \vfil \hbox to \paperwidth { \hfil \includegraphics [width=\paperwidth] { images/hadoop-spark.png} \hfil } \vfil } } %
\begin { frame} [plain]
% \center
% \begin{tikzpicture}[remember picture,overlay]
% \node[at=(current page.center)] {
% \includegraphics[height=\paperheight]{images/justwaithere.jpg}
% };
% \end{tikzpicture}
\end { frame}
}
\begin { frame}
\frametitle { Comparison to MapReduce}
\begin { itemize}
\item In-memory data
\begin { itemize}
\item Less i/o overhead
\item Faster operations
\item Caching
\end { itemize}
\item Better for recursive tasks (e.g. machine learning)
\item Some libraries are dropping MapReduce support
\end { itemize}
\end { frame}
\begin { frame}
\frametitle { Contributors to Spark/Hadoop 2014}
\includegraphics [width=\textwidth] { images/hadoopsparkcontribs.png}
\end { frame}
\begin { frame}
\frametitle { Project status}
\center
\includegraphics [width=\textwidth] { images/sparkgh.png}
\includegraphics [height=.35\textheight] { images/contributors.png}
\end { frame}
\subsection { Key concepts}
\begin { frame}
\frametitle { Overview}
% {\huge Two core concepts}
% \vfill
\begin { columns} [t]
\column { 0.5\textwidth }
Data (RDDs/Datasets)
\begin { itemize}
\item RDD: Resilient Distributed Dataset
\item Collections of objects spread across a cluster, stored in RAM or on Disk
\item Built through parallel transformations
\item Automatically rebuilt on failure
\end { itemize}
\column { 0.5\textwidth }
Operations
\begin { itemize}
\item Transformations (e.g. group, map, groupBy)
\item Actions (e.g. count, collect, save)
\end { itemize}
\end { columns}
\end { frame}
\begin { frame}
\frametitle { Transformations and actions}
\includegraphics [width=\textwidth] { images/sparktransformations.png}
\end { frame}
\begin { frame}
\frametitle { RDDs vs Datasets}
Datasets are the future
\begin { itemize}
\item More memory efficient
\item Libraries dropping support for RDDs
\end { itemize}
\end { frame}
\begin { frame}
\frametitle { RDDs vs Datasets}
\includegraphics [width=\textwidth] { images/performance-wordcount-databricks.png}
\includegraphics [width=\textwidth] { images/Memory-Usage-when-Caching-Chart-databricks.png}
\end { frame}
\begin { frame} [fragile]
\frametitle { Language support}
\vspace { -2em}
\begin { lstlisting} [title=Scala]
val lines = sc.textFile(...)
lines.filter(x => x.contains("ERROR")).count()
\end { lstlisting}
\begin { lstlisting} [title=Python,language=Python]
lines = sc.textFile(...)
lines.filter(lambda s: "ERROR" in s).count()
\end { lstlisting}
\begin { lstlisting} [title=Java,language=Java]
Removed, to keep the slides clean :)
\end { lstlisting}
\center
sc is the Spark Context (more on this later)
\end { frame}
\begin { frame}
\frametitle { Different flavors}
\begin { center}
\begin { tabular} { | l | c | c | c |}
\hline
Language & App & REPL & Performance \\ \hline
Scala & Yes & Yes & \huge { \color { green} \smiley } \\
Java & Yes & No & \huge { \color { green} \smiley } \\
Python & Yes & Yes & \huge { \color { yellow} \smiley } \\ \hline
\end { tabular}
\end { center}
The Read-eval-print-loop (REPL) is the easiest way to get started and explore datasets. It is just a special Spark application that accepts user input (scala code).
\end { frame}
\section { Working with RDDs}
\begin { frame} [fragile]
\frametitle { Creation}
\begin { lstlisting} [title=From normal data structures,numbers=none]
val nums = sc.parallelize(List(1, 2, 3))
val cont = sc.parallelize(List(("a", 1),
("a", 1),
("b", 3)))
\end { lstlisting}
\begin { lstlisting} [title=From distributed/local sources,numbers=none]
sc.textFile('myfile')
\end { lstlisting}
\vfill
\vfill
Note: sc is the spark context in the Spark interpreter
\end { frame}
% #################################
% Starts Collect
% #################################
\begin { frame} [fragile]
\frametitle { Operations: collect}
\only <1>{
\btVFill
{ \huge \ttfamily collect()}
}
\begin { onlyenv} <2->
\btVFill
\begin { lstlisting} [numbers=none,basicstyle=\Large \ttfamily ,title={ Runs any pending transformation and returns the real values} ,linebackgroundcolor={ \btLstHL { 1,3} } ]
nums.collect()
> List(1, 2, 3)
cont.collect()
> List((a, 1), (a, 1), (b, 3))
\end { lstlisting}
\end { onlyenv}
\btVFill
Reminder:
\begin { lstlisting} [numbers=none,linebackgroundcolor={ \color { lightyellow} } ]
nums: List(1, 2, 3)
cont: List(("a", 1), ("a", 1), ("b", 3))
\end { lstlisting}
\end { frame}
% #################################
% Ends Collect
% #################################
% #################################
% Starts Take
% #################################
\begin { frame} [fragile]
\frametitle { Operations: take}
\only <1>{
\btVFill
{ \huge \ttfamily take(N)}
}
\begin { onlyenv} <2>
\btVFill
\begin { lstlisting} [title=Returns the N first elements,basicstyle=\Large \ttfamily ,linebackgroundcolor={ \btLstHL <2>{ 1,3} } ]
nums.take(2)
> List(1, 2)
cont.take(1)
> List((a, 1))
\end { lstlisting}
\end { onlyenv}
\btVFill
Reminder:
\begin { lstlisting} [numbers=none,linebackgroundcolor={ \color { lightyellow} } ]
nums: List(1, 2, 3)
cont: List(("a", 1), ("a", 1), ("b", 3))
\end { lstlisting}
\end { frame}
% #################################
% Ends Take
% #################################
% #################################
% Starts Count
% #################################
\begin { frame} [fragile]
\frametitle { Operations: count}
\only <1>{
\btVFill
{ \huge \ttfamily count()}
}
\begin { onlyenv} <2>
\btVFill
\begin { lstlisting} [title=Returns the number of elements in a collection,basicstyle=\Large \ttfamily ,linebackgroundcolor={ \btLstHL <2>{ 1,3} } ]
nums.count()
> 3
cont.count()
> 3
\end { lstlisting}
\end { onlyenv}
\btVFill
Reminder:
\begin { lstlisting} [numbers=none,linebackgroundcolor={ \color { lightyellow} } ]
nums: List(1, 2, 3)
cont: List(("a", 1), ("a", 1), ("b", 3))
\end { lstlisting}
\end { frame}
% #################################
% Ends Count
% #################################
% #################################
% Starts filter
% #################################
\begin { frame} [fragile]
\frametitle { Operations: filter}
\btVFill
\only <1,2>{
{ \huge \ttfamily filter(fn)}
\btVFill
}
\begin { onlyenv} <2>
This time, we need to define a function.
Filter applies that function to every element, and returns those where the function returns true.
For example:
\end { onlyenv}
\begin { onlyenv} <2,3>
\begin { lstlisting} [basicstyle=\ttfamily \Large ]
val fn = (x:Int) => x > 1
\end { lstlisting}
\end { onlyenv}
\begin { onlyenv} <3>
\btVFill
\begin { lstlisting} [title=Return a list containing the values where the function returns true,linebackgroundcolor={ \btLstHL <3>{ 1} } ,basicstyle=\ttfamily \Large ]
nums.filter(fn)
> List(2, 3)
\end { lstlisting}
\end { onlyenv}
\btVFill
Reminder:
\begin { lstlisting} [numbers=none,linebackgroundcolor={ \color { lightyellow} } ]
nums: List(1, 2, 3)
cont: List(("a", 1), ("a", 1), ("b", 3))
\end { lstlisting}
\end { frame}
% #################################
% Ends filter
% #################################
\begin { frame} [fragile]
\frametitle { Quick aside: anonymous functions and underscores}
% Defining a function when it is only going to be used once is tedious and makes reading code harder.
\only <1,2>{
{ \Large In scala, we can define ``anonymous functions'', also known as lambda functions.}
}
\pause
\vspace { -2em} % Avoid overlapping with footer
\begin { lstlisting} [basicstyle=\ttfamily \Large ]
val fn = (x:Int) => x > 1
nums.filter(fn)
\end { lstlisting}
\only <2>{
{ \Large is equivalent to:}
}
\begin { lstlisting} [basicstyle=\ttfamily \Large ]
nums.filter((x:Int) => x>1)
\end { lstlisting}
\pause
\only <3>{
{ \Large Additionally, the scala compiler is smart enough to infer types in this example. Hence, we could simply write:}
}
\begin { lstlisting} [basicstyle=\ttfamily \Large ]
nums.filter(x => x>1)
\end { lstlisting}
\pause
{ \Large Furthermore, we could use underscores to replace arguments:}
\vspace { -2em}
\begin { lstlisting} [basicstyle=\ttfamily \huge ]
nums.filter(_>1)
\end { lstlisting}
\end { frame}
\begin { frame} [fragile]
\frametitle { Quick aside: anonymous functions and underscores}
{ \large Every new argument in the lambda function represents a parameter
Hence, these two expressions are equivalent
}
\begin { lstlisting} [basicstyle=\ttfamily \huge ]
_ + _
(x,y) => x+y
\end { lstlisting}
\end { frame}
\begin { frame} [fragile]
\frametitle { Operations: filter}
\Large Our last example could be written more concisely as:
\vfill
\begin { lstlisting} [linebackgroundcolor={ \btLstHL <1>{ 1} } ,basicstyle=\ttfamily \Large ]
nums.filter(_>1)
> List(2, 3)
\end { lstlisting}
\end { frame}
\begin { frame} [fragile]
\frametitle { Operations: filter}
\Large What would this filter do?
\vfill
\begin { lstlisting} [basicstyle=\ttfamily \Large ,linebackgroundcolor={ \btLstHL <1>{ 1} } ]
cont.filter(_._1 == "a" && _._2 == 1)
> ???
\end { lstlisting}
\end { frame}
\begin { frame} [fragile]
\frametitle { Operations: filter}
\begin { lstlisting} [basicstyle=\ttfamily \normalsize ,language={ } ]
<console>:9: error: missing parameter type
for expanded function ((x$1, x$2) =>
x$1._1.$eq$eq("a").$amp$amp(x$2._2.$eq$eq(1)))
Note: The expected type requires a
one-argument function accepting a 2-Tuple.
\end { lstlisting}
\end { frame}
{
\pagecolor { black}
\usebackgroundtemplate { \vbox to \paperheight { \vfil \hbox to \paperwidth { \hfil \includegraphics [height=\paperheight] { images/whatsparrow.jpg} \hfil } \vfil } } %
\begin { frame} [plain]
\end { frame}
}
\begin { frame} [fragile]
\frametitle { Operations: filter}
Remember, each new underscore represents a new argument. So that expression expands to:
\begin { lstlisting} [language=TeX,basicstyle=\ttfamily \Large ]
cont.filter((x, y) => x._1 == "a" &&
            y._2 == 1)
\end { lstlisting}
\end { frame}
\begin { frame} [fragile]
\frametitle { Operations: filter}
The right expression is:
\begin { lstlisting} [basicstyle=\ttfamily \Large ,linebackgroundcolor={ \btLstHL <1>{ 1,2} } ]
cont.filter(x => x._1 == "a" &&
            x._2 == 1)
> List((a, 1), (a, 1))
\end { lstlisting}
\end { frame}
% #################################
% Starts map
% #################################
\begin { frame} [fragile]
\frametitle { Operations: map}
\btVFill
\only <1>{
{ \huge \ttfamily map(fn)}
\btVFill
}
\begin { onlyenv} <2>
\begin { lstlisting} [title=Apply a function to every item in the list,basicstyle=\Large \ttfamily ,linebackgroundcolor={ \btLstHL <2>{ 1,3} } ,basicstyle=\ttfamily \Large ]
cont.map(_._2)
> List(1, 1, 3)
nums.map(_*3)
> List(3, 6, 9)
\end { lstlisting}
\end { onlyenv}
Reminder:
\begin { lstlisting} [numbers=none,linebackgroundcolor={ \color { lightyellow} } ]
nums: List(1, 2, 3)
cont: List(("a", 1), ("a", 1), ("b", 3))
\end { lstlisting}
\end { frame}
% #################################
% Ends map
% #################################
% #################################
% Starts reduce
% #################################
\begin { frame} [fragile]
\frametitle { Operations: reduce}
\btVFill
\only <1>{
{ \huge \ttfamily reduce(fn)}
\btVFill
}
\begin { onlyenv} <2>
\btVFill
\begin { lstlisting} [title={ Merge elements with an associative function (concisely)} ,basicstyle=\Large \ttfamily ,linebackgroundcolor={ \btLstHL <2>{ 1,3,4} } ]
nums.reduce(_+_)
> 6
cont.reduce((x, y) => (x._1+y._1,
            x._2*y._2))
> (aab, 3)
\end { lstlisting}
\end { onlyenv}
\btVFill
Reminder:
\begin { lstlisting} [numbers=none,linebackgroundcolor={ \color { lightyellow} } ]
nums: List(1, 2, 3)
cont: List(("a", 1), ("a", 1), ("b", 3))
\end { lstlisting}
\end { frame}
% #################################
% Ends reduce
% #################################
\begin { frame} [fragile]
\frametitle { Operations}
\begin { onlyenv} <1>
\btVFill
{ \huge \ttfamily groupByKey()}
\btVFill
\end { onlyenv}
\btVFill
\begin { onlyenv} <2>
\begin { lstlisting} [title={ Group elements of a list by the first item in the tuple} , numbers=none,basicstyle=\Large \ttfamily ,linebackgroundcolor={ \btLstHL <2>{ 1} } ]
cont.groupByKey()
> [(b, [3]), (a, [1,1])]
\end { lstlisting}
\end { onlyenv}
\btVFill
Reminder:
\begin { lstlisting} [numbers=none,linebackgroundcolor={ \color { lightyellow} } ]
nums: List(1, 2, 3)
cont: List(("a", 1), ("a", 1), ("b", 3))
\end { lstlisting}
\end { frame}
\begin { frame} [fragile]
\frametitle { Operations}
\btVFill
\begin { onlyenv} <1>
\btVFill
{ \huge \ttfamily reduceByKey(fn)}
\btVFill
\btVFill
\end { onlyenv}
\begin { onlyenv} <2>
\begin { lstlisting} [title={ Group by key and reduce each value} ,basicstyle=\huge \ttfamily ,basicstyle=\Large \ttfamily ,linebackgroundcolor={ \btLstHL <2>{ 1} } ]
cont.reduceByKey((x,y)=>x+y)
> [(b,3), (a,2)]
\end { lstlisting}
reduceByKey is more efficient than applying group, map and reduce separately. The reduce function can be given to each worker, which avoids passing unnecessary data.
\footnote{\href{https://databricks.gitbooks.io/databricks-spark-knowledge-base/content/best_practices/prefer_reducebykey_over_groupbykey.html}{Databricks' post on avoiding groupByKey}}
\end { onlyenv}
\btVFill
Reminder:
\begin { lstlisting} [numbers=none,linebackgroundcolor={ \color { lightyellow} } ]
nums: List(1, 2, 3)
cont: List(("a", 1), ("a", 1), ("b", 3))
\end { lstlisting}
\end { frame}
\begin { frame} [fragile]
\frametitle { Operations}
And once you are done, save your results to a file.
\begin { lstlisting} [basicstyle=\Large \ttfamily ]
nums.saveAsTextFile("hdfs://file.txt")
\end { lstlisting}
\end { frame}
\begin { frame} [fragile]
\frametitle { Example: Search in logs}
\begin { lstlisting}
val lines = sc.textFile("hdfs://...")
val errors = lines.filter(s =>
    s.startsWith("ERROR"))
val messages = errors.map(s => s.split("\t")(2))
messages.cache()
messages.filter(s => s.contains("mysql")).count()
messages.filter(s => s.contains("php")).count()
\end { lstlisting}
\end { frame}
% \begin{frame}
% \frametitle{Dependencies}
% \includegraphics[width=\textwidth]{images/RDDdependencies.png}
% \end{frame}
% \begin{frame}[fragile]
% \frametitle{Working with Key-value pairs}
% \begin{lstlisting}{title=Key-value pairs in different languages}
% pair = (a, b)
% pair[0] # => a
% pair[1] # => b
% val pair = (a, b)
% pair._1 // => a
% pair._2 // => b
% Tuple2 pair = new Tuple2(a, b);
% pair._1 // => a
% pair._2 // => b
% \end{lstlisting}
% \end{frame}
\section { Using Spark}
\begin { frame} [fragile]
\frametitle { Local deployment using docker-compose}
\center
{ \Large
\only <1>{
Get the repo
}
\only <2>{
Move to the repo
}
\only <3>{
Run all the containers
}
\only <4>{
Launch spark-shell inside the master container
}
}
\vfill
\begin { lstlisting} [basicstyle=\ttfamily ,linebackgroundcolor={
\btLstHL <1>{ 1} %
\btLstHL <2>{ 2} %
\btLstHL <3>{ 3} %
\btLstHL <4>{ 4} %
} ]
git clone http://github.com/gettyimages/docker-spark
cd docker-spark
docker-compose up
docker exec -it dockerspark_master_1 bin/spark-shell
\end { lstlisting}
\end { frame}
\begin { frame} [fragile]
\frametitle { Demo}
\centering
\href { https://asciinema.org/a/92977} { \includegraphics [width=\textwidth] { gif/sparkrepl-thumb.png} }
\end { frame}
\begin { frame} [fragile]{ Compose.yml Master}
\vspace { -2em} % Avoid overlapping with footer
\begin { lstlisting} [basicstyle=\ttfamily ,language={ } ,linebackgroundcolor={
\btLstHL <1>{ 2} %
\btLstHL <2>{ 3} %
\btLstHL <3>{ 11} %
\btLstHL <4>{ 12} %
} ]
master:
image: gettyimages/spark
command: bin/spark-class org.apache.spark.deploy.master.Master -h master
hostname: master
environment:
MASTER: spark://master:7077
SPARK_CONF_DIR: /conf
SPARK_PUBLIC_DNS: localhost
... bunch of ports ...
volumes:
- ./conf/master:/conf
- ./data:/tmp/data
\end { lstlisting}
\end { frame}
\begin { frame} [fragile]{ Compose.yml Worker}
\vspace { -2em} % Avoid overlapping with footer
\begin { lstlisting} [language={ } ,basicstyle=\ttfamily ,linebackgroundcolor={
\btLstHL <1>{ 2} %
\btLstHL <2>{ 3} %
\btLstHL <3>{ 12,13} %
\btLstHL <4>{ 6-8} %
} ]
worker:
image: gettyimages/spark
command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://master:7077
hostname: worker
environment:
SPARK_CONF_DIR: /conf
SPARK_WORKER_CORES: 2
SPARK_WORKER_MEMORY: 1g
links:
- master
volumes:
- ./conf/worker:/conf
- ./data:/tmp/data
\end { lstlisting}
\end { frame}
\begin { frame} [fragile]
\frametitle { Useful info}
\begin { itemize}
\item ./data folder is mounted as /tmp/data
\begin { itemize}
\item Copy your datasets there
\item Load them in the shell: \texttt { sc.textFile("/tmp/data/...")}
\end { itemize}
\item Master Web UI (localhost:8080)
\item Worker Web UI (localhost:8081)
\item REPL Web UI (localhost:4040 when launched)
\end { itemize}
\textbf { A word of caution} : as any other app, the shell reserves resources on startup, whether you are using them or not.
\end { frame}
\begin { frame} [fragile]
\frametitle { Applications}
Steps:
\begin { itemize}
\item Write the code
\item Compile the jar
\item Make your data available to every node in the cluster
\item Submit it to your cluster
\end { itemize}
\end { frame}
\begin { frame} [fragile]
\frametitle { Writing applications}
\begin { lstlisting} [title=Example application,basicstyle=\ttfamily ]
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
object SparkWordCount {
def main(args: Array[String]) {
// create Spark context with Spark configuration
val sc = new SparkContext(new SparkConf().setAppName("Spark Example"))
... Your program ...
}
\end { lstlisting}
\end { frame}
\begin { frame} [fragile]
\frametitle{Running an application}
\begin { lstlisting} [language={ } ,numbers=none]
./bin/spark-submit --class <main-class> \
--master <master-url> \
--deploy-mode <deploy-mode> \
--conf <key>=<value> \
... # other options
<application-jar> \
[application-arguments]
\end { lstlisting}
\end { frame}
\section { Full examples}
\begin { frame} [fragile]
\frametitle { Word frequency in wikipedia, revisited}
\begin { lstlisting} [title=Spark,basicstyle=\ttfamily ,linebackgroundcolor={ \color { orange!30} } ]
val wiki = sc.textFile("Wikipedia.txt")
val counts = wiki.flatMap(line=> line.split(" "))
  .map(word => (word, 1))
  .reduceByKey(_ + _)
\end { lstlisting}
\begin { lstlisting} [title=Pure scala,basicstyle=\ttfamily ]
val wiki = scala.io.Source.fromFile("Wiki").getLines
wiki.flatMap(line=> line.split(" "))
  .map(x=>(x, 1)).toList
  .groupBy(x => x._1)
  .map({ case (k, v) =>
    (k, v.foldLeft(0)((a, b) => a+b._2))})
\end { lstlisting}
\end { frame}
\begin { frame} [plain]
\center
\huge { Shall we try it in the shell?}
\end { frame}
{
\pagecolor { black}
\usebackgroundtemplate { \vbox to \paperheight { \vfil \hbox to \paperwidth { \hfil \includegraphics [height=\paperheight] { images/justwaithere.jpg} \hfil } \vfil } } %
\begin { frame} [plain]
\end { frame}
}
\begin { frame}
\frametitle { Spark is not magic}
\begin { itemize}
\item We still have to add more resources
\item Caching may cause the spark version to use \textbf { more memory} (this can be configured)
\end { itemize}
\center
\pause
\huge { However, it allows us to scale our application}
\end { frame}
% \begin{frame}
% \frametitle{Word count}
% \includegraphics[width=\textwidth]{images/wordcount.png}
% \end{frame}
\begin { frame} [fragile]
\frametitle { Page rank}
\begin { columns}
\column { 0.5\textwidth }
\includegraphics [width=\textwidth] { images/pagerank.png}
\column { 0.5\textwidth }
\begin { itemize}
\item Created by Google
\item Rank given by links and their importance
\item Iterative (Perfect for Spark!)
\end { itemize}
\end { columns}
\begin { centering}
$
\text { PageRank of site} = \sum \frac { \text { Page rank of inbound link} } { \text { Number of links on that page} }
$
OR
$
PR(u) = (1-d)+d\times \sum \frac { PR(v)} { N(V)}
$
\end { centering}
\end { frame}
{
\pagecolor { white}
\usebackgroundtemplate { \vbox to \paperheight { \vfil \hbox to \paperwidth { \hfil \includegraphics [height=\paperheight] { images/pageranktoy.jpg} \hfil } \vfil } } %
\begin { frame} [plain]
% \includegraphics[width=\textwidth]{images/pageranktoy.jpg}
\footnote { \url { http://www.slideshare.net/sscdotopen/large-scale} }
\end { frame}
}
\begin { frame}
\frametitle { Page rank (code)}
\includegraphics [width=\textwidth] { images/example.png}
\end { frame}
\section * { Next week}
\begin { frame} { Next week}
\begin { itemize}
\item Advanced Spark configuration
\item Multiple hosts
\item Spark ecosystem
\item More examples in IBM BlueMix
\end { itemize}
\end { frame}
\section { Acknowledgements and useful links}
\begin { frame}
\begin { itemize}
\item \href { http://spark.apache.org/docs/latest/programming-guide.html} { Spark programming guide}
\item \href { https://databricks.com/blog/2016/01/04/introducing-apache-spark-datasets.html} { Databricks introducing apache spark datasets}
\item \href { https://www.safaribooksonline.com/library/view/data-analytics-with/9781491913734/ch04.html} { Data Analytics with Hadoop: In-Memory Computing with Spark}
\end { itemize}
\end { frame}
\end { document}