The steps for this recipe are as follows:
- Import the necessary libraries:
# Koalas is distributed as the `databricks.koalas` package — a bare
# `import koalas` fails. (Since Spark 3.2 the same API also ships as
# pyspark.pandas — TODO confirm which runtime this recipe targets.)
import databricks.koalas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
- Import the data from Databricks Delta Lake:
# Pull the labeled sensor readings from the Delta table, excluding the
# 'banana' class, then collect the result to a pandas DataFrame.
# NOTE(review): `spark` is the notebook-provided SparkSession, and
# .toPandas() materializes the full result in driver memory — confirm the
# table is small enough for that.
df = spark.sql("select * from ChemicalSensor where class <> 'banana'")
pdf = df.toPandas()
- Select and encode the data:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Rename the label column: `class` is a Python keyword and awkward to use
# as an attribute/column name downstream.
pdf.rename(columns={'class': 'classification'}, inplace=True)

# NOTE(review): X still contains the 'classification' label column here;
# the feature_cols selection below presumably drops it — confirm, or this
# leaks the target into the features.
X = pdf
y = pdf['classification']

# Integer-encode the string class labels (e.g. 'apple' -> 0, ...).
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(pdf['classification'])

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed
# in 1.4; sparse_output=False keeps the original dense-ndarray behavior.
onehot_encoder = OneHotEncoder(sparse_output=False)
# OneHotEncoder expects a 2-D array: reshape the 1-D labels to a column.
integer_encoded = integer_encoded.reshape(-1, 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
feature_cols = ['r1...