Reading the comma separated file from the input directory one level up from where this document is stored using the read.csv() function. Variable descriptions were obtained from King County, Department of Assessments.
All feature engineering should be done in one the first code chunks of your document.

housedata <- read.csv("../input/stt3851-fall2021/housedata2.csv", 
                      colClasses = c(id = "character", date = "character", 
                                     yr_built = "character", zipcode = "factor", grade = "factor"))
housedata$date <- as.Date(housedata$date, "%Y%m%d")
housedata$waterfront <- factor(housedata$waterfront, labels = c("No", "Yes"))
housedata$condition <- factor(housedata$condition, labels = c("poor", "fair", "average", "good", "very good"))
housedata$yr_renovated <- ifelse(housedata$yr_renovated == 0, housedata$yr_built, housedata$yr_renovated)
housedata$yr_built <- as.Date(ISOdate(housedata$yr_built, 9, 1))  # Complete Year, Sept 1
housedata$yr_renovated <- as.Date(ISOdate(housedata$yr_renovated, 9, 1))  # Last renovated Year, Sept 1
housedata <- housedata[, -1]
#### Perform same steps with test set
housedataT <- read.csv("../input/stt3851-fall2021/housedataTEST2.csv", 
                      colClasses = c(id = "character", date = "character", 
                                     yr_built = "character", zipcode = "factor", grade = "factor"))
housedataT$date <- as.Date(housedataT$date, "%Y%m%d")
housedataT$waterfront <- factor(housedataT$waterfront, labels = c("No", "Yes"))
housedataT$condition <- factor(housedataT$condition, labels = c("poor", "fair", "average", "good", "very good"))
housedataT$yr_renovated <- ifelse(housedataT$yr_renovated == 0, housedataT$yr_built, housedataT$yr_renovated)
housedataT$yr_built <- as.Date(ISOdate(housedataT$yr_built, 9, 1))  # Complete Year, Sept 1
housedataT$yr_renovated <- as.Date(ISOdate(housedataT$yr_renovated, 9, 1))  # Last renovated Year, Sept 1
housedataT <- housedataT[, -1]
library(DT)
datatable(housedata[, 2:10], rownames = FALSE)

Consider predicting the price (price) of a house based on a certain feature (sqft_living). Start by graphing the relationship.

library(ggplot2)
p1 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) + 
  geom_point() + 
  theme_bw()
p1

Overplotting is problematic. What should we do?

Using alpha

p2 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) + 
        geom_point(alpha = 0.05, color = "blue") + 
        theme_bw() 
p2

Using rectangles

p3 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) + 
        stat_bin2d(bins = 50) + 
        theme_bw()
p3

p4 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) + 
        stat_bin2d(bins = 50) + 
        scale_fill_gradient(low = "lightblue", high = "red", 
                            limits = c(0, 1000)) +
        theme_bw()
p4

Using hexagons

p5 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) + 
        stat_binhex(bins = 50) + 
        scale_fill_gradient(low = "lightblue", high = "red", 
                            limits = c(0, 800), breaks = seq(0, 800, by = 200)) +
        theme_bw()
p5

Note For both stat_bin2d and stat_binhex, if you manually specify the range, and there is a bin that falls outside that range because it has too many of too few points, that bin will show up as grey rather than the color at the high or low end of the range. Observe the gray hexagons in the lower left corner of the above graph.

p6 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) + 
        stat_binhex(bins = 50) + 
        scale_fill_gradient(low = "lightblue", high = "red", 
                            limits = c(0, 1000), breaks = seq(0, 1000, by = 200)) +
        theme_bw()
p6

Example — Data Format from Building a Model

library(MASS)
mod1 <- stepAIC(lm(price ~ . - sqft_basement - grade, data = housedata))
Start:  AIC=418003.6
price ~ (date + bedrooms + bathrooms + sqft_living + sqft_lot + 
    floors + waterfront + view + condition + grade + sqft_above + 
    sqft_basement + yr_built + yr_renovated + zipcode + lat + 
    long + sqft_living15 + sqft_lot15) - sqft_basement - grade

                Df  Sum of Sq        RSS    AIC
<none>                        4.7692e+14 418004
- sqft_lot15     1 1.0139e+11 4.7702e+14 418005
- lat            1 3.0675e+11 4.7723e+14 418013
- long           1 3.8871e+11 4.7731e+14 418016
- yr_renovated   1 1.1331e+12 4.7806e+14 418043
- sqft_lot       1 1.2683e+12 4.7819e+14 418048
- yr_built       1 1.6391e+12 4.7856e+14 418061
- bathrooms      1 2.2930e+12 4.7922e+14 418085
- sqft_living15  1 2.6372e+12 4.7956e+14 418098
- date           1 3.0969e+12 4.8002e+14 418114
- floors         1 4.3875e+12 4.8131e+14 418161
- condition      4 4.9913e+12 4.8191e+14 418177
- bedrooms       1 1.1230e+13 4.8815e+14 418406
- sqft_above     1 1.7292e+13 4.9421e+14 418621
- view           1 2.2762e+13 4.9968e+14 418812
- sqft_living    1 3.9806e+13 5.1673e+14 419395
- waterfront     1 4.7657e+13 5.2458e+14 419657
- zipcode       69 3.0970e+14 7.8662e+14 426564
summary(mod1)

Call:
lm(formula = price ~ (date + bedrooms + bathrooms + sqft_living + 
    sqft_lot + floors + waterfront + view + condition + grade + 
    sqft_above + sqft_basement + yr_built + yr_renovated + zipcode + 
    lat + long + sqft_living15 + sqft_lot15) - sqft_basement - 
    grade, data = housedata)

Residuals:
     Min       1Q   Median       3Q      Max 
-1210171   -71962      875    65345  4208423 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)        -3.766e+07  7.072e+06  -5.324 1.03e-07 ***
date                1.190e+02  1.123e+01  10.597  < 2e-16 ***
bedrooms           -3.696e+04  1.832e+03 -20.180  < 2e-16 ***
bathrooms           2.771e+04  3.039e+03   9.119  < 2e-16 ***
sqft_living         1.531e+02  4.031e+00  37.994  < 2e-16 ***
sqft_lot            3.040e-01  4.483e-02   6.782 1.22e-11 ***
floors             -4.606e+04  3.652e+03 -12.614  < 2e-16 ***
waterfrontYes       6.722e+05  1.617e+04  41.572  < 2e-16 ***
view                5.776e+04  2.010e+03  28.730  < 2e-16 ***
conditionfair       1.011e+05  3.692e+04   2.740 0.006157 ** 
conditionaverage    9.938e+04  3.413e+04   2.912 0.003593 ** 
conditiongood       1.206e+05  3.413e+04   3.533 0.000412 ***
conditionvery good  1.663e+05  3.435e+04   4.842 1.30e-06 ***
sqft_above          1.032e+02  4.119e+00  25.041  < 2e-16 ***
yr_built           -2.359e+00  3.059e-01  -7.710 1.33e-14 ***
yr_renovated        2.006e+00  3.130e-01   6.410 1.49e-10 ***
zipcode98002        2.966e+04  1.675e+04   1.771 0.076570 .  
zipcode98003       -7.230e+03  1.482e+04  -0.488 0.625640    
zipcode98004        7.552e+05  2.711e+04  27.858  < 2e-16 ***
zipcode98005        2.694e+05  2.899e+04   9.292  < 2e-16 ***
zipcode98006        2.621e+05  2.361e+04  11.102  < 2e-16 ***
zipcode98007        2.223e+05  2.971e+04   7.484 7.56e-14 ***
zipcode98008        2.297e+05  2.836e+04   8.099 5.88e-16 ***
zipcode98010        9.409e+04  2.518e+04   3.736 0.000188 ***
zipcode98011        3.358e+04  3.709e+04   0.905 0.365312    
zipcode98014        7.001e+04  4.095e+04   1.710 0.087361 .  
zipcode98019        2.775e+04  3.988e+04   0.696 0.486515    
zipcode98022        5.426e+04  2.217e+04   2.447 0.014425 *  
zipcode98023       -4.581e+04  1.370e+04  -3.345 0.000826 ***
zipcode98024        1.558e+05  3.505e+04   4.446 8.81e-06 ***
zipcode98027        1.731e+05  2.432e+04   7.120 1.12e-12 ***
zipcode98028        1.893e+04  3.597e+04   0.526 0.598765    
zipcode98029        2.291e+05  2.769e+04   8.276  < 2e-16 ***
zipcode98030        2.604e+03  1.619e+04   0.161 0.872209    
zipcode98031        1.012e+04  1.698e+04   0.596 0.551208    
zipcode98032        3.762e+03  1.980e+04   0.190 0.849290    
zipcode98033        3.086e+05  3.080e+04  10.020  < 2e-16 ***
zipcode98034        1.263e+05  3.303e+04   3.825 0.000131 ***
zipcode98038        4.638e+04  1.826e+04   2.540 0.011087 *  
zipcode98039        1.191e+06  3.582e+04  33.249  < 2e-16 ***
zipcode98040        4.968e+05  2.423e+04  20.506  < 2e-16 ***
zipcode98042        1.644e+04  1.565e+04   1.050 0.293520    
zipcode98045        1.575e+05  3.384e+04   4.655 3.27e-06 ***
zipcode98052        1.853e+05  3.139e+04   5.903 3.64e-09 ***
zipcode98053        1.384e+05  3.353e+04   4.128 3.68e-05 ***
zipcode98055        2.000e+04  1.898e+04   1.053 0.292136    
zipcode98056        6.045e+04  2.064e+04   2.929 0.003409 ** 
zipcode98058        2.646e+04  1.789e+04   1.479 0.139216    
zipcode98059        5.807e+04  2.028e+04   2.863 0.004202 ** 
zipcode98065        7.522e+04  3.106e+04   2.422 0.015461 *  
zipcode98070       -7.983e+04  2.403e+04  -3.322 0.000895 ***
zipcode98072        7.586e+04  3.668e+04   2.068 0.038638 *  
zipcode98074        1.632e+05  2.964e+04   5.506 3.73e-08 ***
zipcode98075        1.574e+05  2.841e+04   5.541 3.05e-08 ***
zipcode98077        6.121e+04  3.830e+04   1.598 0.110053    
zipcode98092       -1.538e+04  1.464e+04  -1.051 0.293431    
zipcode98102        5.262e+05  3.235e+04  16.263  < 2e-16 ***
zipcode98103        2.691e+05  2.992e+04   8.994  < 2e-16 ***
zipcode98105        4.261e+05  3.061e+04  13.921  < 2e-16 ***
zipcode98106        8.902e+04  2.215e+04   4.019 5.86e-05 ***
zipcode98107        2.810e+05  3.071e+04   9.150  < 2e-16 ***
zipcode98108        7.561e+04  2.471e+04   3.059 0.002222 ** 
zipcode98109        4.446e+05  3.159e+04  14.072  < 2e-16 ***
zipcode98112        5.774e+05  2.805e+04  20.583  < 2e-16 ***
zipcode98115        2.634e+05  3.030e+04   8.694  < 2e-16 ***
zipcode98116        2.371e+05  2.466e+04   9.614  < 2e-16 ***
zipcode98117        2.440e+05  3.074e+04   7.937 2.19e-15 ***
zipcode98118        1.327e+05  2.150e+04   6.169 7.03e-10 ***
zipcode98119        4.405e+05  2.981e+04  14.774  < 2e-16 ***
zipcode98122        3.037e+05  2.672e+04  11.367  < 2e-16 ***
zipcode98125        1.157e+05  3.279e+04   3.528 0.000419 ***
zipcode98126        1.484e+05  2.284e+04   6.500 8.26e-11 ***
zipcode98133        6.772e+04  3.381e+04   2.003 0.045238 *  
zipcode98136        1.992e+05  2.320e+04   8.587  < 2e-16 ***
zipcode98144        2.427e+05  2.479e+04   9.793  < 2e-16 ***
zipcode98146        4.570e+04  2.071e+04   2.206 0.027372 *  
zipcode98148        4.276e+04  2.913e+04   1.468 0.142136    
zipcode98155        4.914e+04  3.525e+04   1.394 0.163369    
zipcode98166        1.335e+04  1.907e+04   0.700 0.483921    
zipcode98168        3.261e+04  2.014e+04   1.620 0.105324    
zipcode98177        1.261e+05  3.524e+04   3.578 0.000347 ***
zipcode98178        1.075e+03  2.055e+04   0.052 0.958296    
zipcode98188        2.898e+03  2.097e+04   0.138 0.890117    
zipcode98198       -2.719e+04  1.600e+04  -1.699 0.089311 .  
zipcode98199        3.069e+05  2.915e+04  10.529  < 2e-16 ***
lat                 2.443e+05  7.325e+04   3.335 0.000854 ***
long               -1.957e+05  5.212e+04  -3.754 0.000174 ***
sqft_living15       3.154e+01  3.226e+00   9.779  < 2e-16 ***
sqft_lot15         -1.362e-01  7.104e-02  -1.918 0.055191 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 166100 on 17295 degrees of freedom
Multiple R-squared:  0.7993,    Adjusted R-squared:  0.7982 
F-statistic: 782.5 on 88 and 17295 DF,  p-value: < 2.2e-16

Predicting with TEST data

PredictedPrice <- predict(mod1, newdata = housedataT)
head(PredictedPrice)
       1        2        3        4        5        6 
277523.1 857547.1 311085.6 604683.3 436416.6 536388.7 
formatSubmission <- data.frame(id = 1:4229, price = PredictedPrice)
head(formatSubmission)
  id    price
1  1 277523.1
2  2 857547.1
3  3 311085.6
4  4 604683.3
5  5 436416.6
6  6 536388.7
write.csv(formatSubmission, "Example.csv", row.names  = FALSE)

Predicting with Linear Model

#```{r}

#model_log <- lm(log(PredictedPrice)~., data = housedataT) #plot(model_log, which=2) #summary(model_log)

#```

Random Forest

library(caret)
set.seed(123)
myControl <- trainControl(method = "cv", number = 5)

mod_rf <- train(y = housedata$price, x = housedata[ , c(3:15, 17:20)], trControl = myControl, method = "ranger")
 summary(mod_rf)
                          Length Class         Mode     
predictions               17384  -none-        numeric  
num.trees                     1  -none-        numeric  
num.independent.variables     1  -none-        numeric  
mtry                          1  -none-        numeric  
min.node.size                 1  -none-        numeric  
prediction.error              1  -none-        numeric  
forest                        7  ranger.forest list     
splitrule                     1  -none-        character
treetype                      1  -none-        character
r.squared                     1  -none-        numeric  
call                          9  -none-        call     
importance.mode               1  -none-        character
num.samples                   1  -none-        numeric  
replace                       1  -none-        logical  
xNames                       17  -none-        character
problemType                   1  -none-        character
tuneValue                     3  data.frame    list     
obsLevels                     1  -none-        logical  
param                         0  -none-        list     

Backwards Elimination

library(caret)
myControl5 <- trainControl(method = "cv",
                           number = 10)
mod_BE <- train(price ~ sqft_living + sqft_basement, 
                data = housedata,
                trControl =myControl5,
                method = "lm")

summary(mod_BE)

Call:
lm(formula = .outcome ~ ., data = dat)

Residuals:
     Min       1Q   Median       3Q      Max 
-1519267  -148056   -22909   105354  4327691 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)   -43982.635   4979.516  -8.833  < 2e-16 ***
sqft_living      277.557      2.413 115.046  < 2e-16 ***
sqft_basement     20.612      5.003   4.120 3.81e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 262800 on 17381 degrees of freedom
Multiple R-squared:  0.4946,    Adjusted R-squared:  0.4945 
F-statistic:  8504 on 2 and 17381 DF,  p-value: < 2.2e-16

Download the Example.csv file and upload it to kaggle to be evaluated.