1. Preliminaries
- Scala code is built with Maven. For an example, refer to [1].
- For a way to install Spark, refer to [2].
- OS: CentOS 7
- CPU: AMD Ryzen 7 3700X 8-Core Processor
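- The spark-submit commands below run a jar-with-dependencies artifact, which suggests the pom configures the maven-assembly-plugin; assuming a pom along the lines of [1], the jar would be produced by mvn package. The exact pom is not reproduced here.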
 
2. Sample code
2.1 Accumulator
(1) Build the following code, which is based on [3].
package package1
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
object App {
  def main(args: Array[String]): Unit = {
    // Note: setMaster("local") set here in code takes precedence over
    // the --master option passed to spark-submit.
    val conf = new SparkConf().setAppName("my-app").setMaster("local")
    val sc = new SparkContext(conf)
    val data = Array(1, 2, 3, 4, 5)
    // A long accumulator that the executors add to and the driver reads.
    val accum = sc.longAccumulator("my-app")
    val rdd = sc.parallelize(data)
    rdd.foreach(x => accum.add(x))
    // Read the accumulator on the driver after the action has run.
    println("Counter value: " + accum.value)
    sc.stop()
  }
}
(2) Execute the code with spark-submit [4].
~/spark-3.1.2/bin/spark-submit --class package1.App --master local[8] ./target/deequ-1.0-jar-with-dependencies.jar
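With the sample data, this prints Counter value: 15, the sum of 1 through 5.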
2.2 Calculate π
(1) Build the following code, which is based on the Pi estimation example in [5].
package package1
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
object App {
  def main(args: Array[String]): Unit = {
    // Note: setMaster("local") set here in code takes precedence over
    // the --master option passed to spark-submit.
    val conf = new SparkConf().setAppName("my-app").setMaster("local")
    val sc = new SparkContext(conf)
    // The number of samples is given as the first command-line argument.
    println(args(0))
    val NUM_SAMPLES = args(0).toInt
    // Count random points in the unit square that fall inside the quarter circle.
    val count = sc.parallelize(1 to NUM_SAMPLES).filter { _ =>
      val x = math.random
      val y = math.random
      x*x + y*y < 1
    }.count()
    println(s"Pi is roughly ${4.0 * count / NUM_SAMPLES}")
    sc.stop()
  }
}
(2) Execute the code with spark-submit [4], passing the number of samples as an argument. The fraction of sampled points that fall inside the quarter circle approximates π/4, so 4.0 * count / NUM_SAMPLES approximates π. With 1,000,000,000 samples the run takes about 25 seconds, faster than the equivalent SQL in PostgreSQL [6], which takes about 150 seconds.
~/spark-3.1.2/bin/spark-submit --class package1.App --master local[8] ./target/deequ-1.0-jar-with-dependencies.jar 1000000000
2.3 Spark SQL
(1) Create a test dataset with the following Python code, passing 600000000 as the command-line argument.
import random
import sys
args = sys.argv
random.seed(0)  # fixed seed for reproducibility
print("value,dummy1,dummy2,dummy3,dummy4,dummy5")
# Each row: one random value followed by five fixed-content dummy columns.
for i in range(int(args[1])):
    row = [str(random.random())] + ["01234567890123456789dummy" + str(j) for j in range(1, 6)]
    print(",".join(row))
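The Scala code in the next step reads ./data.csv, so redirect the generator's output there. Assuming the script is saved as gen.py (an illustrative name, not given in the original), a run looks like:
python gen.py 600000000 > data.csv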
(2) Build and execute the following code, which is based on [7]. The SQL query takes about 110 seconds, slower than PostgreSQL, which takes about 7 seconds.
package package1
import org.apache.spark.sql.SparkSession
import java.io.File
object App {
  def main(args: Array[String]): Unit = {
    // Print the working directory so the relative CSV path is easy to check.
    val currentDir = new File(".").getAbsoluteFile().getParent()
    println(currentDir)
    val spark = SparkSession
      .builder()
      .appName("Spark SQL basic example")
      .getOrCreate()
    // Read the generated CSV; inferSchema makes an extra pass over the file.
    val df = spark.read.format("csv")
      .option("sep", ",")
      .option("inferSchema", "true")
      .option("header", "true")
      .load("./data.csv")
    df.show()
    df.createOrReplaceTempView("test")
    val start = System.nanoTime()
    // spark.sql() is lazy; the query actually runs when show() forces the result.
    val result = spark.sql("SELECT avg(value) AS average_value FROM test")
    result.show()
    val end = System.nanoTime()
    println("Time elapsed: " + (end - start) / 1000000000.0 + " secs")
    spark.stop()
  }
}
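The code can be executed with spark-submit [4] as in the previous sections; assuming the same jar name:
~/spark-3.1.2/bin/spark-submit --class package1.App --master local[8] ./target/deequ-1.0-jar-with-dependencies.jar
Because Spark evaluates lazily, the measured time includes the full scan of data.csv needed to compute the average, not just query planning.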
3. References
[1] Various ways to print “Hello world” in Scala
[2] Installing Apache Spark from source
[3] RDD Programming Guide
https://spark.apache.org/docs/latest/rdd-programming-guide.html
[4] Submitting Applications
https://spark.apache.org/docs/latest/submitting-applications.html
[5] Apache Spark Examples
https://spark.apache.org/examples.html
[6] Calculate π in PostgreSQL
[7] Spark SQL, DataFrames and Datasets Guide
https://spark.apache.org/docs/2.3.0/sql-programming-guide.html