## Read in the new data to score. file_type, schema, and file_location
## are assumed to be defined beforehand; a sketch follows below.
dataset = spark.read \
  .format(file_type) \
  .option("header", "true") \
  .schema(schema) \
  .load(file_location)
## You can avoid defining a schema by having Spark infer it from your data.
## Inference requires an extra pass over the data and can guess column types
## wrong, so it is slower and less reliable than an explicit schema.
#.option("inferSchema", "true")
## Fill in NA values, if needed
# dataset = dataset.na.fill(0)
display(dataset)
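For reference, the cell above assumes that file_type, schema, and file_location were already defined. A minimal sketch of what those definitions might look like, with hypothetical values (only id and ReportingDate are known from the scoring step later in this post; feature1 is a placeholder):

from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

## Hypothetical location and format; point these at your own data
file_location = "/mnt/data/to_score.csv"
file_type = "csv"

## Explicit schema matching the file's columns
schema = StructType([
  StructField("id", IntegerType(), True),
  StructField("ReportingDate", DateType(), True),
  StructField("feature1", StringType(), True)
])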
Load the transformation pipeline and the trained model
from pyspark.ml.tuning import CrossValidatorModel
from pyspark.ml import PipelineModel
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import col, round
from pyspark.ml.regression import GeneralizedLinearRegressionModel
mypipeline = PipelineModel.load("/mnt/trainedmodels/pipeline/")
mymodel = CrossValidatorModel.load("/mnt/trainedmodels/lr")
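This post doesn't show how those artifacts were saved. Assuming the standard MLlib persistence API, the training notebook would have written them out roughly like this (variable names are hypothetical):

## In the training notebook, after fitting:
# fitted_pipeline.write().overwrite().save("/mnt/trainedmodels/pipeline/")
# cv_model.write().overwrite().save("/mnt/trainedmodels/lr")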
Score data using the model
## Transform new data using the pipeline
mydataset = mypipeline.transform(dataset)
## Score the new data with the best model found during cross-validation
scoreddataset = mymodel.bestModel.transform(mydataset)
output = scoreddataset.select(col("id"),
                              col("ReportingDate"),
                              col("prediction").alias("MyForecast"))
display(output)
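The round and IntegerType imports above suggest the forecast was meant to end up as a whole number; a sketch of that cast, along with persisting the scored output (the output path and format here are hypothetical, not from the original post):

## Round the forecast and cast it to an integer (an assumption about the
## intended use of the round and IntegerType imports above)
output = output.withColumn("MyForecast", round(col("MyForecast"), 0).cast(IntegerType()))

## Write the scored data out for downstream use; path and format are examples
output.write.mode("overwrite").parquet("/mnt/scored/forecasts")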