The notes on this page are fragmentary, immature thoughts of the author. Please read them with your own judgement!
https://medium.com/mrpowers/manually-creating-spark-dataframes-b14dae906393
Seq.toDF

toDF() provides a concise syntax for creating DataFrames and can be accessed after importing the Spark implicits:

    import spark.implicits._

SparkSession.createDataFrame
%%classpath add mvn
org.apache.spark spark-core_2.11 2.1.1
org.apache.spark spark-sql_2.11 2.1.1
Loading...
import org.apache.spark.sql.{SparkSession, Row}
import org.apache.spark.sql.types._
// Assemble the session configuration first, then get (or reuse) the
// SparkSession — getOrCreate() returns an existing local session if one
// is already running in this JVM.
val sessionBuilder = SparkSession.builder()
  .master("local")
  .appName("createDF example")
  .config("spark.some.config.option", "some-value")
val spark = sessionBuilder.getOrCreate()
spark
import spark.implicits._

val someDF = Seq(
  (8, "bat"),
  (64, "mouse"),
  (-27, "horse")
).toDF("Number", "Words")

someDF.show()
someDF.printSchema()

+------+-----+
|Number|Words|
+------+-----+
| 8| bat|
| 64|mouse|
| -27|horse|
+------+-----+
root
|-- Number: integer (nullable = false)
|-- Words: string (nullable = true)
null
// Sample rows; each Row lines up positionally with someSchema below.
val someData = List(8 -> "bat", 64 -> "mouse", -27 -> "horse").map {
  case (number, word) => Row(number, word)
}

// Explicit schema for the rows above; both columns are nullable.
val someSchema = List(
  StructField("number", IntegerType, nullable = true),
  StructField("word", StringType, nullable = true)
)
// Build a DataFrame explicitly: wrap the Rows in an RDD and pair them with
// a StructType built from someSchema. NOTE(review): the closing parenthesis
// of this call appears on the following (garbled) line of the page.
val someDF = spark.createDataFrame(
spark.sparkContext.parallelize(someData),
StructType(someSchema)
)

Seq(1, 2, 3).toDF.show

createDF() is defined in spark-daria and allows for the following terse syntax.
// val someDF = spark.createDF(
// List(
// (8, "bat"),
// (64, "mouse"),
// (-27, "horse")
// ), List(
// ("number", IntegerType, true),
// ("word", StringType, true)
// )
// )

(Note: the original input is incomplete past this point.)