# Load the source file into a DataFrame, applying the explicit schema
# defined earlier rather than inferring one from the data.
dataset = (
    spark.read
    .format(file_type)
    .option("header", "true")
    .schema(schema)
    .load(file_location)
)

# You can avoid defining a schema by having Spark infer it from your data.
# This doesn't always work and can be slow:
# .option("inferSchema", "true")

# Fill in NA's, if needed:
# dataset = dataset.na.fill(0)

display(dataset)
# Transform new data using the (already fitted) pipeline.
mydataset = mypipeline.transform(dataset)

# Score the transformed data using the best model found during tuning.
scoreddataset = mymodel.bestModel.transform(mydataset)

# Keep only the identifier, reporting date, and the prediction column
# renamed to a friendlier name for downstream consumers.
output = scoreddataset.select(
    col("id"),
    col("ReportingDate"),
    col("prediction").alias("MyForecast"),
)

display(output)