Creating a Spark Project with SBT, IntelliJ, sbt-spark-package, and friends

Add sbt-spark-package

// project/plugins.sbt — register the sbt-spark-package plugin.
// NOTE(review): Bintray was sunset in 2021, so this resolver URL may no longer
// resolve; confirm against the current spark-packages hosting location.
resolvers += "bintray-spark-packages" at "https://dl.bintray.com/spark-packages/maven/"
addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.6")
// build.sbt — settings contributed by the plugin: pin the Spark version and
// pull in only the Spark modules the project uses (here, spark-sql).
sparkVersion := "2.2.0"
sparkComponents ++= Seq("sql")

Set the Scala and SBT versions

// build.sbt — Scala 2.11 line, matching what Spark 2.2.0 was published for.
scalaVersion := "2.11.12"
// project/build.properties (NOT build.sbt) — Java-properties syntax, no quotes.
sbt.version = 0.13.17

Add the SparkSession

spark-pika/
src/
main/
scala/
com/
github/
mrpowers/
spark/
pika/
SparkSessionWrapper.scala
package com.github.mrpowers.spark.pika

import org.apache.spark.sql.SparkSession

/**
 * Mixes a lazily-created [[SparkSession]] into any class that needs one
 * (production code and test suites alike).
 *
 * The session is `lazy` so it is only started on first use, and
 * `getOrCreate()` reuses an already-running session instead of building a
 * second one.
 */
trait SparkSessionWrapper {

  // Single local-mode session shared by everything that mixes in this trait.
  lazy val spark: SparkSession = {
    SparkSession
      .builder()
      .master("local")
      .appName("spark pika")
      .getOrCreate()
  }

}

Write some code

package com.github.mrpowers.spark.pika

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._

/** Custom DataFrame transformations for the spark-pika project. */
object Tubular {

  /**
   * Appends a literal "chi" column with the value "happy" to every row.
   *
   * The empty first parameter list keeps the signature in the
   * `Something()(df: DataFrame): DataFrame` shape expected by
   * `DataFrame#transform`, so callers write `df.transform(Tubular.withGoodVibes())`.
   */
  def withGoodVibes()(df: DataFrame): DataFrame = {
    val goodVibes = lit("happy")
    df.withColumn("chi", goodVibes)
  }

}
$ sbt console
> val df = List("sue", "fan").toDF("name")
> import com.github.mrpowers.spark.pika.Tubular
> val betterDF = df.transform(Tubular.withGoodVibes())
> betterDF.show()
+----+-----+
|name| chi|
+----+-----+
| sue|happy|
| fan|happy|
+----+-----+

Add some tests

// build.sbt — test-scoped dependencies.
// spark-fast-tests is resolved from the Spark Packages repository (note the
// single `%`: spark-packages artifacts are not cross-built per Scala version
// in the usual `%%` way — the Spark/Scala versions are encoded in the version string).
libraryDependencies += "MrPowers" % "spark-fast-tests" % "2.2.0_0.5.0" % "test"
libraryDependencies += "org.scalatest" %% "scalatest" % "3.0.1" % "test"
package com.github.mrpowers.spark.pika

import org.scalatest.FunSpec

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, StructField, StructType}

import com.github.mrpowers.spark.fast.tests.DataFrameComparer

/**
 * Specs for [[Tubular]]. Reuses the shared session from
 * [[SparkSessionWrapper]] and spark-fast-tests' DataFrame comparisons.
 */
class TubularSpec
    extends FunSpec
    with SparkSessionWrapper
    with DataFrameComparer {

  import spark.implicits._

  describe("withGoodVibes") {

    it("appends a chi column to a DataFrame") {

      val inputDF = List("sue", "fan").toDF("name")

      val resultDF = inputDF.transform(Tubular.withGoodVibes())

      // Expected output: the original nullable "name" column plus a
      // non-nullable "chi" column filled with "happy".
      val expectedSchema = StructType(
        List(
          StructField("name", StringType, true),
          StructField("chi", StringType, false)
        )
      )
      val expectedRows = List(
        Row("sue", "happy"),
        Row("fan", "happy")
      )
      val expectedDF = spark.createDataFrame(
        spark.sparkContext.parallelize(expectedRows),
        expectedSchema
      )

      assertSmallDataFrameEquality(resultDF, expectedDF)

    }

  }

}

Cleaning up the test

libraryDependencies += "mrpowers" % "spark-daria" % "2.2.0_0.12.0" % "test"
package com.github.mrpowers.spark.pika

import org.scalatest.FunSpec

import org.apache.spark.sql.types.StringType

import com.github.mrpowers.spark.fast.tests.DataFrameComparer
import com.github.mrpowers.spark.daria.sql.SparkSessionExt._

/**
 * Specs for [[Tubular]], tidied up with spark-daria's `createDF`, which
 * builds a DataFrame from rows plus (name, type, nullable) column triples
 * in one call — no Row/StructType boilerplate.
 */
class TubularSpec
    extends FunSpec
    with SparkSessionWrapper
    with DataFrameComparer {

  describe("withGoodVibes") {

    it("appends a chi column to a DataFrame") {

      // Input: a single nullable string column called "name".
      val inputDF = spark.createDF(
        List(
          "sue",
          "fan"
        ), List(
          ("name", StringType, true)
        )
      )

      val resultDF = inputDF.transform(Tubular.withGoodVibes())

      // Expected: same rows with a non-nullable "chi" column of "happy".
      val expectedDF = spark.createDF(
        List(
          ("sue", "happy"),
          ("fan", "happy")
        ), List(
          ("name", StringType, true),
          ("chi", StringType, false)
        )
      )

      assertSmallDataFrameEquality(resultDF, expectedDF)

    }

  }

}

Building the JAR file

// build.sbt — embed the Spark version in the JAR name
// (e.g. spark-pika_2.11-2.2.0_0.1.0.jar) so consumers can see at a glance
// which Spark release the artifact was built against.
artifactName := { (sv: ScalaVersion, module: ModuleID, artifact: Artifact) =>
artifact.name + "_" + sv.binary + "-" + sparkVersion.value + "_" + module.revision + "." + artifact.extension
}

Next Steps

--

--

--

Spark coder, live in Colombia / Brazil / US, love Scala / Python / Ruby, working on empowering Latinos and Latinas in tech

Love podcasts or audiobooks? Learn on the go with our new app.

Recommended from Medium

What is AR & Which Programming Language Can Be Used In It?

The redefined state of application managed services

Want To Find The Best Schools Or Safest Cities in America? CitySpire Does It All

How to Fix the “No Audio Output Device Is Installed” Error on Windows PC

Create simple REST APIs for Google Sheets

Powerful Google Sheets integration with CheapSheet API

Hands-on experience with AWS

Connecting Steampipe with Google BigQuery

AWS API Gateway — Ways to handle Request Timeout

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store
Matthew Powers

Matthew Powers

Spark coder, live in Colombia / Brazil / US, love Scala / Python / Ruby, working on empowering Latinos and Latinas in tech

More from Medium

Insights of Object-Oriented Programming in Scala

Why is Apache Spark faster than MapReduce?

Faster Java UDF in Pyspark

Dynamic Partition Upsert — SPARK