本日重點:用第一天學到的資料選取技巧,運用視覺化、基本的統計等工具,來「看」一下房價資料,哪些欄位重要。
##設定環境
#setwd(dir) #設定working directory的存放位置
# MAC : setwd("/Users/rladiestaipei/R_DragonBall/")
# Windows : setwd("C://Users/rladiestaipei/Desktop/R_DragonBall/")
#安裝套件(僅需執行一次)
#install.packages(c("tidyverse", "plotly", "zoo", "lubridate", "rmarkdown","data.table", "DT", "kableExtra", "corrplot", "gridExtra"), dependencies = TRUE)
#load packages
library(tidyverse)
library(ggplot2)
library(dplyr)
library(magrittr)
library(corrplot)
library(gridExtra)
library(plotly)
# Set dplyr's `dplyr.print_max` option to a very large value (1e9);
# presumably so that printed tables are never truncated -- confirm against
# the dplyr options documentation.
options(dplyr.print_max=1e9)
資料特性
開始進行,資料特性大致上分下列幾種:
# Load the training and test sets, keeping text columns as character vectors
# (not factors).
train0 <- read.csv(file = "train.csv", stringsAsFactors = FALSE)
test0 <- read.csv(file = "test.csv", stringsAsFactors = FALSE)

# Split the training columns by storage type: names of the numeric columns
# and names of the character columns.
num_features <- names(Filter(is.numeric, train0))
cat_features <- names(Filter(is.character, train0))

# Column subsets of train0 matching each name list.
train_numeric <- train0[, is.element(names(train0), num_features)]
train_categoric <- train0[, is.element(names(train0), cat_features)]

# Show the numeric column names.
print(num_features)
[1] "Id" "MSSubClass" "LotFrontage" "LotArea" "OverallQual" "OverallCond"
[7] "YearBuilt" "YearRemodAdd" "MasVnrArea" "BsmtFinSF1" "BsmtFinSF2" "BsmtUnfSF"
[13] "TotalBsmtSF" "X1stFlrSF" "X2ndFlrSF" "LowQualFinSF" "GrLivArea" "BsmtFullBath"
[19] "BsmtHalfBath" "FullBath" "HalfBath" "BedroomAbvGr" "KitchenAbvGr" "TotRmsAbvGrd"
[25] "Fireplaces" "GarageYrBlt" "GarageCars" "GarageArea" "WoodDeckSF" "OpenPorchSF"
[31] "EnclosedPorch" "X3SsnPorch" "ScreenPorch" "PoolArea" "MiscVal" "MoSold"
[37] "YrSold" "SalePrice"
# Show the names of the character (categorical) columns found in train0.
print(cat_features)
[1] "MSZoning" "Street" "Alley" "LotShape" "LandContour" "Utilities"
[7] "LotConfig" "LandSlope" "Neighborhood" "Condition1" "Condition2" "BldgType"
[13] "HouseStyle" "RoofStyle" "RoofMatl" "Exterior1st" "Exterior2nd" "MasVnrType"
[19] "ExterQual" "ExterCond" "Foundation" "BsmtQual" "BsmtCond" "BsmtExposure"
[25] "BsmtFinType1" "BsmtFinType2" "Heating" "HeatingQC" "CentralAir" "Electrical"
[31] "KitchenQual" "Functional" "FireplaceQu" "GarageType" "GarageFinish" "GarageQual"
[37] "GarageCond" "PavedDrive" "PoolQC" "Fence" "MiscFeature" "SaleType"
[43] "SaleCondition"
資料中有 36 個欄位是數字、有 43 個欄位是類別,再加上 Id 和 SalePrice。
是否需要把 categorical 的資料變成 numerical?
null 或 NA 的數量有多少?處理缺失值(NA)的方法有 3 種。
連續型數值是否需要標準化,需視後面的模型而定:通常 Regression model 有差,但是 Tree-based model 沒有差。(第三天的 Feature Engineering 會告訴你如何做,別急!)
# 先看num_features的36個欄位
# MSSubClass (dwelling type), OverallQual (material and finish rating) and
# OverallCond (overall condition rating) are clearly categorical codes, so
# recode all three as factors in one step.
train0[c("OverallCond", "OverallQual", "MSSubClass")] <-
  lapply(train0[c("OverallCond", "OverallQual", "MSSubClass")], as.factor)

# Count the missing values in every column and express each count as a
# share of the number of rows.
missing_values <- vapply(
  train0,
  function(column) sum(is.na(column)),
  integer(1)
)
null_count <- data.frame(
  Count = missing_values,
  Proportion = missing_values / nrow(train0)
)

# Keep only the columns that have at least one missing value, then print
# them from the most to the least affected.
null_count_gteZero <- subset(null_count, Count > 0)
with(null_count_gteZero, null_count_gteZero[order(-Count), ])
Count Proportion
PoolQC 1453 0.9952054795
MiscFeature 1406 0.9630136986
Alley 1369 0.9376712329
Fence 1179 0.8075342466
FireplaceQu 690 0.4726027397
LotFrontage 259 0.1773972603
GarageType 81 0.0554794521
GarageYrBlt 81 0.0554794521
GarageFinish 81 0.0554794521
GarageQual 81 0.0554794521
GarageCond 81 0.0554794521
BsmtExposure 38 0.0260273973
BsmtFinType2 38 0.0260273973
BsmtQual 37 0.0253424658
BsmtCond 37 0.0253424658
BsmtFinType1 37 0.0253424658
MasVnrType 8 0.0054794521
MasVnrArea 8 0.0054794521
Electrical 1 0.0006849315
# Drop every column that contains at least one NA (the row names of
# null_count_gteZero), together with OverallCond, OverallQual and MSSubClass.
train_non_null <- train0[
  setdiff(
    names(train0),
    c(rownames(null_count_gteZero), "OverallCond", "OverallQual", "MSSubClass")
  )
]
共有 19 個欄位出現 NA,其中有 4 個欄位(PoolQC、MiscFeature、Alley、Fence)的缺失值比例超過 50%。
# 先剔除出現NA的欄位,而且是數字的欄位
# Alternation ("a|b|c") of every numeric column name found in train0.
# Kept because a later statement in this file still reads it.
match_num_features <- paste(num_features, collapse = "|")

# Numeric-only view of the NA-free data.
# Columns are chosen by type instead of by `matches(match_num_features)`:
# `matches()` does case-insensitive substring matching on names, so a
# non-numeric column whose name merely contains one of the numeric names
# would be pulled in and later break cor(). Column order is unchanged.
train_non_null_df <- train_non_null[
  , vapply(train_non_null, is.numeric, logical(1)), drop = FALSE
]
學習用 ggplot() + tidyr package,快速畫圖。
若用 for loop + par(mfrow = ...) 逐一畫圖,程式寫完、跑完,那一天大概也過完了。
library(tidyr) 的 gather() 搭配 facet_wrap(),可以快速幫你解決這個問題(沒有業配XD)。gather() 會把欄位分成 key 跟 value 兩種,重新組合成一個「長」的格式;可以用 key 跟 value 選擇要保留下來的欄位。
也可以用 library(purrr) 的 keep() 選擇需要的變量。
示範用面積相關的欄位
# Make the black-and-white ggplot2 theme the default for the plots below.
theme_set(theme_bw())

# Keep the columns whose names match "SF" (area measures) plus SalePrice.
train_SF <- select(train_non_null, matches("SF|SalePrice"))

# Relationship between each area column and SalePrice.
# The data is reshaped to long form (var = column name, value = cell value)
# and drawn as one scatter panel per column, each with its own axis scales.
# (An optional purrr keep(is.numeric) filter was left disabled in this step.)
ggplot(
  data = gather(train_SF, key = "var", value = "value", -SalePrice),
  mapping = aes(x = value, y = SalePrice)
) +
  geom_point() +
  facet_wrap(~ var, scales = "free")
除了 LowQualFinSF 之外,其餘幾乎都與 SalePrice 呈正比。
示範用時間相關的欄位
# Keep the time-related columns (names matching Yr / Year / Mo) plus
# SalePrice.
train_Time <- select(
  train_non_null,
  matches("Yr|Year|Mo|year|yr|SalePrice")
)

# One scatter panel per time column against SalePrice, after reshaping to
# long form (var = column name, value = cell value); panels get free scales.
ggplot(
  data = gather(train_Time, key = "var", value = "value", -SalePrice),
  mapping = aes(x = value, y = SalePrice)
) +
  geom_point() +
  facet_wrap(~ var, scales = "free")
示範用區域空間相關的欄位
# Keep the columns whose names match "Area" (space measures) plus SalePrice.
train_Area <- select(train_non_null, matches("Area|SalePrice"))

# One scatter panel per area column against SalePrice, after reshaping to
# long form (var = column name, value = cell value); panels get free scales.
ggplot(
  data = gather(train_Area, key = "var", value = "value", -SalePrice),
  mapping = aes(x = value, y = SalePrice)
) +
  geom_point() +
  facet_wrap(~ var, scales = "free")
feature 候選人:X1stFlrSF, TotalBsmtSF, YearBuilt, GrLivArea, LotArea。
# 先剔除出現NA的欄位,而且是數字的欄位
# Alternation ("a|b|c") of every character (categorical) column name.
# NOTE(review): no statement visible in this section reads this pattern;
# confirm whether a categorical subset was meant to be built from it.
match_cat_features <- paste(cat_features, collapse = "|")

# Rebuild the numeric-only view of the NA-free data. It has to stay numeric
# because the correlation step further down passes train_non_null_df to
# cor(). Columns are chosen by type rather than by a name regex, since
# `matches()` does case-insensitive substring matching and could pull in a
# non-numeric column whose name contains one of the numeric names.
train_non_null_df <- train_non_null[
  , vapply(train_non_null, is.numeric, logical(1)), drop = FALSE
]
示範用房屋外觀相關的欄位
# Select the exterior-related columns (names matching Roof, LotShape,
# Exterior) plus SalePrice.
# NOTE(review): "MSSubClass" is part of the pattern, but that column is
# removed when train_non_null is built, so it matches nothing here.
train_outside <- select(
  train_non_null,
  matches("Roof|MSSubClass|LotShape|Exterior|SalePrice")
)

# Relationship between each exterior attribute and SalePrice: reshape to
# long form (var = column name, value = category), then draw one boxplot
# panel per column with free scales. X labels are rotated 90 degrees so the
# category names stay readable.
train_outside %>%
  gather(-SalePrice, key = "var", value = "value") %>%
  ggplot(aes(x = value, y = SalePrice)) +
  facet_wrap(~ var, scales = "free") +
  # TRUE instead of T: T is an ordinary variable that can be reassigned.
  geom_boxplot(na.rm = TRUE) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
示範用房屋內部相關的欄位
# Select the interior-related columns (names matching BldgType, Utilities,
# House, Bsmt, TotRmsAbvGrd, Fireplace) plus SalePrice.
# NOTE(review): the pattern also matches columns listed as numeric in
# num_features (e.g. BsmtFinSF1, TotRmsAbvGrd, Fireplaces). After gather()
# they share one `value` column with the text columns, so presumably each
# distinct number becomes its own box -- confirm that is intended.
train_inside <- select(
  train_non_null,
  matches("BldgType|Utilities|House|Bsmt|TotRmsAbvGrd|Fireplace|SalePrice")
)

# One boxplot panel per selected column against SalePrice, with free scales
# and x labels rotated 90 degrees.
train_inside %>%
  gather(-SalePrice, key = "var", value = "value") %>%
  ggplot(aes(x = value, y = SalePrice)) +
  facet_wrap(~ var, scales = "free") +
  # TRUE instead of T: T is an ordinary variable that can be reassigned.
  geom_boxplot(na.rm = TRUE) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
示範用其他空間相關的欄位
# Select the remaining location / garage / utility columns (names matching
# Electrical, Neighborhood, Street, Garage, MSZoning) plus SalePrice.
# NOTE(review): Electrical has a missing value in the NA table, so it is
# dropped when train_non_null is built and matches nothing here. The Garage
# part matches numeric columns (GarageCars, GarageArea); after gather()
# presumably each distinct number becomes its own box -- confirm intended.
train_other <- select(
  train_non_null,
  matches("Electrical|Neighborhood|Street|Garage|MSZoning|SalePrice")
)

# One boxplot panel per selected column against SalePrice, with free scales
# and x labels rotated 90 degrees.
train_other %>%
  gather(-SalePrice, key = "var", value = "value") %>%
  ggplot(aes(x = value, y = SalePrice)) +
  facet_wrap(~ var, scales = "free") +
  # TRUE instead of T: T is an ordinary variable that can be reassigned.
  geom_boxplot(na.rm = TRUE) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
Neighborhood 是影響房價的重要因素,其中最昂貴的房子在 NoRidge(Northridge),是美國愛荷華州 Ames 市的一個社區。
correlations <- cor(train_non_null_df, use = "complete.obs")
# Take the SalePrice column of the correlation matrix as a one-column data
# frame and copy its row names (the variable names) into an `item` column.
cor_bar <- data.frame("cor" = correlations[, "SalePrice"])
cor_bar[["item"]] <- rownames(cor_bar)

# Sort from the highest to the lowest correlation, then discard the first
# row of the sorted table.
cor_bar <- cor_bar[order(-cor_bar[["cor"]]), ]
cor_bar <- cor_bar[-1, ]

# Fix the factor levels in sorted order so the bars keep that order.
cor_bar[["item"]] <- factor(cor_bar[["item"]], levels = cor_bar[["item"]])

# Horizontal bar chart of each variable's correlation with SalePrice.
ggplot(data = cor_bar) +
  geom_bar(mapping = aes(x = item, y = cor), stat = 'identity', width = .5) +
  coord_flip() +
  labs(title = "Correlations Bars")
與 SalePrice 相關係數較高的欄位:GrLivArea: 0.71, GarageCars: 0.64, GarageArea: 0.62。
SalePrice 的分佈摘要:
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  34900  129975  163000  180921  214000  755000
SalePrice 取 log() 後對模型訓練也許有幫助。取 log() 後的分佈摘要:
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  10.46   11.78   12.00   12.02   12.27   13.53
請挑選 training data 中你覺得重要的其他欄位,看看它們和房價有什麼關係。