// 统计每一天的销售额 (DSL风格)
// Compute total sales per day (DataFrame DSL style).
// 需求: 数据源为 (时间, 销售金额, 用户ID) 的记录
// 数据源中存在不完整的记录, 需要过滤掉这些数据
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions.sum
import org.apache.spark.sql.{DataFrame, SparkSession}

/**
 * Computes the total sales amount per day using the DataFrame DSL.
 *
 * Input records have the shape "date,amount,userId". Records that are
 * missing a field (fewer than 3 comma-separated parts) are filtered out
 * before aggregation.
 */
object _01_Exercise {

  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .master("local")
      .appName("Exercise")
      .getOrCreate()

    import spark.implicits._

    // Data source: (date, sale amount, user ID). Some rows lack the user ID
    // and are considered incomplete.
    val array: Array[String] = Array(
      "2020-10-01,55,1122",
      "2020-10-02,54,1134",
      "2020-10-03,53",
      "2020-10-01,56,1133",
      "2020-10-02,60",
      "2020-10-02,55,1133",
      "2020-10-04,62",
      "2020-10-03,70,1134",
      "2020-10-04,53,1133",
      "2020-10-06,56",
      "2020-10-03,57,1133"
    )

    // 1. Build an RDD from the in-memory test data.
    val rdd: RDD[String] = spark.sparkContext.parallelize(array)

    // 2. Keep only complete rows (all three fields present) and convert to
    //    a DataFrame of Log(time, sal).
    val df: DataFrame = rdd
      .filter(_.split(",").length == 3)
      .map { line =>
        val parts: Array[String] = line.split(",")
        Log(parts(0), parts(1).toLong)
      }
      .toDF()

    // 3. Sum sales per day.
    // BUG FIX: the alias must be applied to the aggregated COLUMN inside
    // agg(...), not to the resulting Dataset — the original
    // `.agg(sum('sal)).as("sum_sal")` left the output column named
    // "sum(sal)" and created a useless Dataset alias instead.
    // Also replaced the deprecated Symbol syntax `'sal` with `$"sal"`.
    df.groupBy("time")
      .agg(sum($"sal").as("sum_sal"))
      .show()

    // Release local Spark resources when the job is done.
    spark.stop()
  }
}
case class Log(time: String, sal: Long)