In this blog post I explore the n_estimators and max_features hyperparameters of a random forest regressor, show which values work best for this model, and walk through how to select them.
```python
import pandas as pd
```
The data frame df is loaded in R and, thanks to reticulate, is available inside the Python chunks as r.df. Here are its first six rows:

| | bedrooms | bathrooms | m2_living | floors | m2_above | m2_basement | m2_lot | view | quality | yr_built | renovated_last_5 | city | statezip | price |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 3 | 1.50 | 124 | 1.5 | 124 | 0 | 735 | 0 | 3 | 1961 | 0 | 37 | 63 | 313000 |
| 2 | 5 | 2.50 | 339 | 2.0 | 313 | 26 | 841 | 4 | 5 | 1927 | 1 | 36 | 59 | 2384000 |
| 3 | 3 | 2.00 | 179 | 1.0 | 179 | 0 | 1110 | 0 | 4 | 1972 | 1 | 19 | 27 | 342000 |
| 4 | 3 | 2.25 | 186 | 1.0 | 93 | 93 | 746 | 0 | 4 | 1969 | 1 | 4 | 8 | 420000 |
| 5 | 4 | 2.50 | 180 | 1.0 | 106 | 74 | 975 | 0 | 4 | 1982 | 0 | 32 | 32 | 550000 |
| 6 | 2 | 1.00 | 82 | 1.0 | 82 | 0 | 593 | 0 | 3 | 1944 | 0 | 36 | 55 | 490000 |
Next, check for missing values:

```python
r.df.isnull().sum()
```
bedrooms 0
bathrooms 0
m2_living 0
floors 0
m2_above 0
m2_basement 0
m2_lot 0
view 0
quality 0
yr_built 0
renovated_last_5 0
city 0
statezip 0
price 0
dtype: int64
No column has missing values, so we can move straight to modeling. I separate the target (price) from the features and hold out 20% of the rows as a validation set:

```python
y = r.df['price']
X = r.df.drop('price', axis=1)

from sklearn.model_selection import train_test_split

X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.2, random_state=0)
```
```python
print('Shape of X_train = ', X_train.shape)
print('Shape of y_train = ', y_train.shape)
print('Shape of X_validation = ', X_validation.shape)
print('Shape of y_validation = ', y_validation.shape)
```

Shape of X_train = (3680, 13)
Shape of y_train = (3680,)
Shape of X_validation = (920, 13)
Shape of y_validation = (920,)
First, fit a baseline random forest with the default hyperparameters:

```python
from sklearn.ensemble import RandomForestRegressor

regressor1 = RandomForestRegressor(random_state=0)
regressor1.fit(X_train, y_train)
```

RandomForestRegressor(random_state=0)

```python
train_pred = regressor1.predict(X_train)
valid_pred = regressor1.predict(X_validation)
```
```python
from sklearn.metrics import mean_absolute_error

train_mae = mean_absolute_error(y_train, train_pred)
valid_mae = mean_absolute_error(y_validation, valid_pred)
print(f'Validation set mean absolute error is {round(valid_mae, 2)}')
```
Validation set mean absolute error is 142365.36
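For comparison, it is worth printing the training-set MAE computed above as well (its value is not shown in this post): random forests usually fit the training data far more closely than held-out data, and the size of the gap is a quick gauge of overfitting.

```python
# train_mae was computed above but never printed; a large gap between
# training and validation MAE is a sign of overfitting.
print(f'Training set mean absolute error is {round(train_mae, 2)}')
```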
In this part I train a random forest for every value of n_estimators from 1 to 30, keeping everything else fixed, and record the MAE on both sets:

```python
train_mae2 = []  # stores the training-set MAE for each n_estimators
valid_mae2 = []  # stores the validation-set MAE for each n_estimators

for nesti in range(1, 31):
    regressor2 = RandomForestRegressor(n_estimators=nesti, random_state=0)
    regressor2.fit(X_train, y_train)
    train_pred2 = regressor2.predict(X_train)
    valid_pred2 = regressor2.predict(X_validation)
    train_mae2.append(mean_absolute_error(y_train, train_pred2))
    valid_mae2.append(mean_absolute_error(y_validation, valid_pred2))
```
```r
train_mae <- ToDataframe(py$train_mae2)
train_mae %>%
  ggplot(aes(x = enu, y = error)) +
  geom_line() +
  labs(title = "Distribution of MAE over n_estimators for training set",
       x = "n_estimators",
       y = "mean absolute error") +
  theme_minimal() +
  themes()
```
```r
valid_mae <- ToDataframe(py$valid_mae2)
valid_mae %>%
  ggplot(aes(x = enu, y = error)) +
  geom_line() +
  labs(title = "Distribution of MAE over n_estimators for validation set",
       x = "n_estimators",
       y = "mean absolute error") +
  theme_minimal() +
  themes()
```
```python
# +1 below: list index 0 corresponds to n_estimators=1
minimum2 = valid_mae2.index(min(valid_mae2))
print(f'Minimum Mean Absolute error is {round(min(valid_mae2), 2)}\nMinimum error is at {minimum2 + 1} n_estimators')
```
Minimum Mean Absolute error is 138483.76
Minimum error is at 4 n_estimators
Which value of n_estimators gives the best results for the validation set? As the output above shows, the minimum validation MAE occurs at n_estimators = 4.

How did I decide that this value of n_estimators gives the best results? I created a model for each n_estimators value from 1 to 30 and stored the validation MAE in the list valid_mae2. Then I compared all the results and looked for the minimum error; the n_estimators value that produced it is the one I kept.
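As an aside, scikit-learn can automate this kind of sweep. Below is a minimal sketch (not part of the original workflow) using GridSearchCV, which scores each candidate with 5-fold cross-validation on the training data rather than our single fixed validation split, so the value it picks may differ slightly from the loop above:

```python
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Candidate values for n_estimators; GridSearchCV maximizes its score,
# so error metrics are negated ('neg_mean_absolute_error').
param_grid = {'n_estimators': range(1, 31)}
search = GridSearchCV(RandomForestRegressor(random_state=0),
                      param_grid,
                      scoring='neg_mean_absolute_error',
                      cv=5)
search.fit(X_train, y_train)

print(search.best_params_)   # best n_estimators according to cross-validation
print(-search.best_score_)   # its cross-validated MAE
```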
In this part I fix n_estimators at 4 (the best value found above) and vary max_features from 1 to 13, again recording the MAE on both sets:

```python
train_mae3 = []  # stores the training-set MAE for each max_features
valid_mae3 = []  # stores the validation-set MAE for each max_features

for max_fea in range(1, 14):
    regressor3 = RandomForestRegressor(n_estimators=4, max_features=max_fea, random_state=0)
    regressor3.fit(X_train, y_train)
    train_pred3 = regressor3.predict(X_train)
    valid_pred3 = regressor3.predict(X_validation)
    train_mae3.append(mean_absolute_error(y_train, train_pred3))
    valid_mae3.append(mean_absolute_error(y_validation, valid_pred3))
```
```r
train_mae <- ToDataframe(py$train_mae3)
train_mae %>%
  ggplot(aes(x = enu, y = error)) +
  geom_line() +
  labs(title = "Distribution of MAE over Max_features for training set",
       x = "# of features",
       y = "mean absolute error") +
  theme_minimal() +
  themes()
```
```r
valid_mae <- ToDataframe(py$valid_mae3)
valid_mae %>%
  ggplot(aes(x = enu, y = error)) +
  geom_line() +
  labs(title = "Distribution of MAE over Max_features for validation set",
       x = "# of features",
       y = "mean absolute error") +
  theme_minimal() +
  themes()
```
## Best model performance
```python
minimum3 = valid_mae3.index(min(valid_mae3))
print(f'Minimum Mean Absolute error is {round(min(valid_mae3), 2)}\nMinimum error is at {minimum3 + 1} max_features')
```
Minimum Mean Absolute error is 138483.76
Minimum error is at 13 max_features
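Since the dataset has exactly 13 features, max_features = 13 means every feature is considered at every split, which is also scikit-learn's default behavior for regression; that is why the best MAE here (138483.76) matches the one already found in the n_estimators sweep. As a wrap-up, here is a minimal sketch of refitting the chosen configuration (the name best_regressor is mine, not from the chunks above):

```python
# Refit the best configuration found above: n_estimators=4, max_features=13.
best_regressor = RandomForestRegressor(n_estimators=4, max_features=13, random_state=0)
best_regressor.fit(X_train, y_train)

# With random_state=0 this should reproduce the validation MAE of ~138483.76.
best_mae = mean_absolute_error(y_validation, best_regressor.predict(X_validation))
print(f'Best model validation MAE is {round(best_mae, 2)}')
```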