While playing with Spark Datasets in Scala, I stumbled upon a curious problem: when the case class Usage is defined inside the scope of the main() function, I get an error. After browsing some docs, this seems to come down to how Spark derives Encoders: the implicits from spark.implicits._ need compile-time type information (a TypeTag) for Usage, and the compiler cannot provide that for a class defined locally inside a method. Therefore the case class must be defined at the object-level or package-level scope.

This is something I want to dig deeper into…
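
For reference, here is a minimal sketch of the variant that fails. The object name DatasetsBroken and the exact shape of the snippet are mine; what I actually ran into was a compile error about a missing Encoder/TypeTag for Usage:

import org.apache.spark.sql.SparkSession

object DatasetsBroken {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("datasets").master("local[*]").getOrCreate()
    import spark.implicits._

    // case class defined inside main(): the implicit Encoder for Usage cannot be
    // derived here, because no TypeTag is available for a method-local class
    case class Usage(uid: Int, user: String, usage: Int)

    val ds = spark.createDataset(Seq(Usage(0, "user-abc", 42)))  // does not compile
    ds.show()
  }
}

Moving the case class to the top of the file, as below, makes everything compile and run: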

package sparklearning

import scala.util.Random
import org.apache.spark.sql.SparkSession

case class Usage(uid: Int, user: String, usage: Int)

object Datasets {
  val spark = SparkSession
    .builder()
    .appName("datasets")
    .master("local[*]")
    .getOrCreate()

  // Usage used to be defined inside main() below; it now lives at package level (see above)

  def main(args: Array[String]): Unit = {
    val r = new Random(42)
    // Generate 1001 synthetic usage records with random 5-character user names
    val data = for (i <- 0 to 1000)
      yield Usage(i, "user-" + r.alphanumeric.take(5).mkString, r.nextInt(1000))

    // Brings the implicit Encoder[Usage] into scope; this is where the error
    // surfaces if Usage is defined locally inside main()
    import spark.implicits._
    val ds = spark.createDataset(data)

    ds.show()
  }
}
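
The error only shows up when the case class is method-local. As far as I can tell, defining it at object-level scope, i.e. inside Datasets but outside main(), works just as well, since the compiler can still produce the type information Spark needs. A quick sketch of that variant:

import org.apache.spark.sql.SparkSession

object Datasets {
  // object-level scope: this also compiles and runs fine
  case class Usage(uid: Int, user: String, usage: Int)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("datasets").master("local[*]").getOrCreate()
    import spark.implicits._
    val ds = spark.createDataset(Seq(Usage(0, "user-abc", 42)))
    ds.show()
  }
}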