Reading the comma separated file from the input
directory one level up from where this document is stored using the read.csv()
function. Variable descriptions were obtained from King County, Department of Assessments.
All feature engineering should be done in one the first code chunks of your document.
housedata <- read.csv("../input/stt3851-fall2021/housedata2.csv",
colClasses = c(id = "character", date = "character",
yr_built = "character", zipcode = "factor", grade = "factor"))
housedata$date <- as.Date(housedata$date, "%Y%m%d")
housedata$waterfront <- factor(housedata$waterfront, labels = c("No", "Yes"))
housedata$condition <- factor(housedata$condition, labels = c("poor", "fair", "average", "good", "very good"))
housedata$yr_renovated <- ifelse(housedata$yr_renovated == 0, housedata$yr_built, housedata$yr_renovated)
housedata$yr_built <- as.Date(ISOdate(housedata$yr_built, 9, 1)) # Complete Year, Sept 1
housedata$yr_renovated <- as.Date(ISOdate(housedata$yr_renovated, 9, 1)) # Last renovated Year, Sept 1
housedata <- housedata[, -1]
#### Perform same steps with test set
housedataT <- read.csv("../input/stt3851-fall2021/housedataTEST2.csv",
colClasses = c(id = "character", date = "character",
yr_built = "character", zipcode = "factor", grade = "factor"))
housedataT$date <- as.Date(housedataT$date, "%Y%m%d")
housedataT$waterfront <- factor(housedataT$waterfront, labels = c("No", "Yes"))
housedataT$condition <- factor(housedataT$condition, labels = c("poor", "fair", "average", "good", "very good"))
housedataT$yr_renovated <- ifelse(housedataT$yr_renovated == 0, housedataT$yr_built, housedataT$yr_renovated)
housedataT$yr_built <- as.Date(ISOdate(housedataT$yr_built, 9, 1)) # Complete Year, Sept 1
housedataT$yr_renovated <- as.Date(ISOdate(housedataT$yr_renovated, 9, 1)) # Last renovated Year, Sept 1
housedataT <- housedataT[, -1]
library(DT)
datatable(housedata[, 2:10], rownames = FALSE)
Consider predicting the price (price
) of a house based on a certain feature (sqft_living
). Start by graphing the relationship.
library(ggplot2)
p1 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) +
geom_point() +
theme_bw()
p1
Overplotting is problematic. What should we do?
alpha
).alpha
p2 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) +
geom_point(alpha = 0.05, color = "blue") +
theme_bw()
p2
p3 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) +
stat_bin2d(bins = 50) +
theme_bw()
p3
p4 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) +
stat_bin2d(bins = 50) +
scale_fill_gradient(low = "lightblue", high = "red",
limits = c(0, 1000)) +
theme_bw()
p4
p5 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) +
stat_binhex(bins = 50) +
scale_fill_gradient(low = "lightblue", high = "red",
limits = c(0, 800), breaks = seq(0, 800, by = 200)) +
theme_bw()
p5
Note For both stat_bin2d
and stat_binhex
, if you manually specify the range, and there is a bin that falls outside that range because it has too many of too few points, that bin will show up as grey rather than the color at the high or low end of the range. Observe the gray hexagons in the lower left corner of the above graph.
p6 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) +
stat_binhex(bins = 50) +
scale_fill_gradient(low = "lightblue", high = "red",
limits = c(0, 1000), breaks = seq(0, 1000, by = 200)) +
theme_bw()
p6
library(MASS)
mod1 <- stepAIC(lm(price ~ . - sqft_basement - grade, data = housedata))
Start: AIC=418003.6
price ~ (date + bedrooms + bathrooms + sqft_living + sqft_lot +
floors + waterfront + view + condition + grade + sqft_above +
sqft_basement + yr_built + yr_renovated + zipcode + lat +
long + sqft_living15 + sqft_lot15) - sqft_basement - grade
Df Sum of Sq RSS AIC
<none> 4.7692e+14 418004
- sqft_lot15 1 1.0139e+11 4.7702e+14 418005
- lat 1 3.0675e+11 4.7723e+14 418013
- long 1 3.8871e+11 4.7731e+14 418016
- yr_renovated 1 1.1331e+12 4.7806e+14 418043
- sqft_lot 1 1.2683e+12 4.7819e+14 418048
- yr_built 1 1.6391e+12 4.7856e+14 418061
- bathrooms 1 2.2930e+12 4.7922e+14 418085
- sqft_living15 1 2.6372e+12 4.7956e+14 418098
- date 1 3.0969e+12 4.8002e+14 418114
- floors 1 4.3875e+12 4.8131e+14 418161
- condition 4 4.9913e+12 4.8191e+14 418177
- bedrooms 1 1.1230e+13 4.8815e+14 418406
- sqft_above 1 1.7292e+13 4.9421e+14 418621
- view 1 2.2762e+13 4.9968e+14 418812
- sqft_living 1 3.9806e+13 5.1673e+14 419395
- waterfront 1 4.7657e+13 5.2458e+14 419657
- zipcode 69 3.0970e+14 7.8662e+14 426564
summary(mod1)
Call:
lm(formula = price ~ (date + bedrooms + bathrooms + sqft_living +
sqft_lot + floors + waterfront + view + condition + grade +
sqft_above + sqft_basement + yr_built + yr_renovated + zipcode +
lat + long + sqft_living15 + sqft_lot15) - sqft_basement -
grade, data = housedata)
Residuals:
Min 1Q Median 3Q Max
-1210171 -71962 875 65345 4208423
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.766e+07 7.072e+06 -5.324 1.03e-07 ***
date 1.190e+02 1.123e+01 10.597 < 2e-16 ***
bedrooms -3.696e+04 1.832e+03 -20.180 < 2e-16 ***
bathrooms 2.771e+04 3.039e+03 9.119 < 2e-16 ***
sqft_living 1.531e+02 4.031e+00 37.994 < 2e-16 ***
sqft_lot 3.040e-01 4.483e-02 6.782 1.22e-11 ***
floors -4.606e+04 3.652e+03 -12.614 < 2e-16 ***
waterfrontYes 6.722e+05 1.617e+04 41.572 < 2e-16 ***
view 5.776e+04 2.010e+03 28.730 < 2e-16 ***
conditionfair 1.011e+05 3.692e+04 2.740 0.006157 **
conditionaverage 9.938e+04 3.413e+04 2.912 0.003593 **
conditiongood 1.206e+05 3.413e+04 3.533 0.000412 ***
conditionvery good 1.663e+05 3.435e+04 4.842 1.30e-06 ***
sqft_above 1.032e+02 4.119e+00 25.041 < 2e-16 ***
yr_built -2.359e+00 3.059e-01 -7.710 1.33e-14 ***
yr_renovated 2.006e+00 3.130e-01 6.410 1.49e-10 ***
zipcode98002 2.966e+04 1.675e+04 1.771 0.076570 .
zipcode98003 -7.230e+03 1.482e+04 -0.488 0.625640
zipcode98004 7.552e+05 2.711e+04 27.858 < 2e-16 ***
zipcode98005 2.694e+05 2.899e+04 9.292 < 2e-16 ***
zipcode98006 2.621e+05 2.361e+04 11.102 < 2e-16 ***
zipcode98007 2.223e+05 2.971e+04 7.484 7.56e-14 ***
zipcode98008 2.297e+05 2.836e+04 8.099 5.88e-16 ***
zipcode98010 9.409e+04 2.518e+04 3.736 0.000188 ***
zipcode98011 3.358e+04 3.709e+04 0.905 0.365312
zipcode98014 7.001e+04 4.095e+04 1.710 0.087361 .
zipcode98019 2.775e+04 3.988e+04 0.696 0.486515
zipcode98022 5.426e+04 2.217e+04 2.447 0.014425 *
zipcode98023 -4.581e+04 1.370e+04 -3.345 0.000826 ***
zipcode98024 1.558e+05 3.505e+04 4.446 8.81e-06 ***
zipcode98027 1.731e+05 2.432e+04 7.120 1.12e-12 ***
zipcode98028 1.893e+04 3.597e+04 0.526 0.598765
zipcode98029 2.291e+05 2.769e+04 8.276 < 2e-16 ***
zipcode98030 2.604e+03 1.619e+04 0.161 0.872209
zipcode98031 1.012e+04 1.698e+04 0.596 0.551208
zipcode98032 3.762e+03 1.980e+04 0.190 0.849290
zipcode98033 3.086e+05 3.080e+04 10.020 < 2e-16 ***
zipcode98034 1.263e+05 3.303e+04 3.825 0.000131 ***
zipcode98038 4.638e+04 1.826e+04 2.540 0.011087 *
zipcode98039 1.191e+06 3.582e+04 33.249 < 2e-16 ***
zipcode98040 4.968e+05 2.423e+04 20.506 < 2e-16 ***
zipcode98042 1.644e+04 1.565e+04 1.050 0.293520
zipcode98045 1.575e+05 3.384e+04 4.655 3.27e-06 ***
zipcode98052 1.853e+05 3.139e+04 5.903 3.64e-09 ***
zipcode98053 1.384e+05 3.353e+04 4.128 3.68e-05 ***
zipcode98055 2.000e+04 1.898e+04 1.053 0.292136
zipcode98056 6.045e+04 2.064e+04 2.929 0.003409 **
zipcode98058 2.646e+04 1.789e+04 1.479 0.139216
zipcode98059 5.807e+04 2.028e+04 2.863 0.004202 **
zipcode98065 7.522e+04 3.106e+04 2.422 0.015461 *
zipcode98070 -7.983e+04 2.403e+04 -3.322 0.000895 ***
zipcode98072 7.586e+04 3.668e+04 2.068 0.038638 *
zipcode98074 1.632e+05 2.964e+04 5.506 3.73e-08 ***
zipcode98075 1.574e+05 2.841e+04 5.541 3.05e-08 ***
zipcode98077 6.121e+04 3.830e+04 1.598 0.110053
zipcode98092 -1.538e+04 1.464e+04 -1.051 0.293431
zipcode98102 5.262e+05 3.235e+04 16.263 < 2e-16 ***
zipcode98103 2.691e+05 2.992e+04 8.994 < 2e-16 ***
zipcode98105 4.261e+05 3.061e+04 13.921 < 2e-16 ***
zipcode98106 8.902e+04 2.215e+04 4.019 5.86e-05 ***
zipcode98107 2.810e+05 3.071e+04 9.150 < 2e-16 ***
zipcode98108 7.561e+04 2.471e+04 3.059 0.002222 **
zipcode98109 4.446e+05 3.159e+04 14.072 < 2e-16 ***
zipcode98112 5.774e+05 2.805e+04 20.583 < 2e-16 ***
zipcode98115 2.634e+05 3.030e+04 8.694 < 2e-16 ***
zipcode98116 2.371e+05 2.466e+04 9.614 < 2e-16 ***
zipcode98117 2.440e+05 3.074e+04 7.937 2.19e-15 ***
zipcode98118 1.327e+05 2.150e+04 6.169 7.03e-10 ***
zipcode98119 4.405e+05 2.981e+04 14.774 < 2e-16 ***
zipcode98122 3.037e+05 2.672e+04 11.367 < 2e-16 ***
zipcode98125 1.157e+05 3.279e+04 3.528 0.000419 ***
zipcode98126 1.484e+05 2.284e+04 6.500 8.26e-11 ***
zipcode98133 6.772e+04 3.381e+04 2.003 0.045238 *
zipcode98136 1.992e+05 2.320e+04 8.587 < 2e-16 ***
zipcode98144 2.427e+05 2.479e+04 9.793 < 2e-16 ***
zipcode98146 4.570e+04 2.071e+04 2.206 0.027372 *
zipcode98148 4.276e+04 2.913e+04 1.468 0.142136
zipcode98155 4.914e+04 3.525e+04 1.394 0.163369
zipcode98166 1.335e+04 1.907e+04 0.700 0.483921
zipcode98168 3.261e+04 2.014e+04 1.620 0.105324
zipcode98177 1.261e+05 3.524e+04 3.578 0.000347 ***
zipcode98178 1.075e+03 2.055e+04 0.052 0.958296
zipcode98188 2.898e+03 2.097e+04 0.138 0.890117
zipcode98198 -2.719e+04 1.600e+04 -1.699 0.089311 .
zipcode98199 3.069e+05 2.915e+04 10.529 < 2e-16 ***
lat 2.443e+05 7.325e+04 3.335 0.000854 ***
long -1.957e+05 5.212e+04 -3.754 0.000174 ***
sqft_living15 3.154e+01 3.226e+00 9.779 < 2e-16 ***
sqft_lot15 -1.362e-01 7.104e-02 -1.918 0.055191 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 166100 on 17295 degrees of freedom
Multiple R-squared: 0.7993, Adjusted R-squared: 0.7982
F-statistic: 782.5 on 88 and 17295 DF, p-value: < 2.2e-16
PredictedPrice <- predict(mod1, newdata = housedataT)
head(PredictedPrice)
1 2 3 4 5 6
277523.1 857547.1 311085.6 604683.3 436416.6 536388.7
formatSubmission <- data.frame(id = 1:4229, price = PredictedPrice)
head(formatSubmission)
id price
1 1 277523.1
2 2 857547.1
3 3 311085.6
4 4 604683.3
5 5 436416.6
6 6 536388.7
write.csv(formatSubmission, "Example.csv", row.names = FALSE)
#```{r}
#model_log <- lm(log(PredictedPrice)~., data = housedataT) #plot(model_log, which=2) #summary(model_log)
#```
library(caret)
set.seed(123)
myControl <- trainControl(method = "cv", number = 5)
mod_rf <- train(y = housedata$price, x = housedata[ , c(3:15, 17:20)], trControl = myControl, method = "ranger")
summary(mod_rf)
Length Class Mode
predictions 17384 -none- numeric
num.trees 1 -none- numeric
num.independent.variables 1 -none- numeric
mtry 1 -none- numeric
min.node.size 1 -none- numeric
prediction.error 1 -none- numeric
forest 7 ranger.forest list
splitrule 1 -none- character
treetype 1 -none- character
r.squared 1 -none- numeric
call 9 -none- call
importance.mode 1 -none- character
num.samples 1 -none- numeric
replace 1 -none- logical
xNames 17 -none- character
problemType 1 -none- character
tuneValue 3 data.frame list
obsLevels 1 -none- logical
param 0 -none- list
Backwards Elimination
library(caret)
myControl5 <- trainControl(method = "cv",
number = 10)
mod_BE <- train(price ~ sqft_living + sqft_basement,
data = housedata,
trControl =myControl5,
method = "lm")
summary(mod_BE)
Call:
lm(formula = .outcome ~ ., data = dat)
Residuals:
Min 1Q Median 3Q Max
-1519267 -148056 -22909 105354 4327691
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -43982.635 4979.516 -8.833 < 2e-16 ***
sqft_living 277.557 2.413 115.046 < 2e-16 ***
sqft_basement 20.612 5.003 4.120 3.81e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 262800 on 17381 degrees of freedom
Multiple R-squared: 0.4946, Adjusted R-squared: 0.4945
F-statistic: 8504 on 2 and 17381 DF, p-value: < 2.2e-16
Download the Example.csv
file and upload it to kaggle to be evaluated.