1. Preprocessing data and EDA (Exploratory Data Analysis)

# Read in the datasets
train <- read.csv("train.csv", na.strings = "N/A") # 23313 x 96
test <- read.csv("test.csv", na.strings = "N/A") # 5829 x 95

# Remove unnecessary variables and variables with >20 NA values
train <- train %>% dplyr::select(-listing_url, -scrape_id, -last_scraped, -name, -summary, -space, -description, -experiences_offered, -neighborhood_overview, -notes, -transit, -access, -interaction, -house_rules, -thumbnail_url, -medium_url, -picture_url, -xl_picture_url, -host_id, -host_url, -host_name, -host_since, -host_location, -host_about, -host_acceptance_rate, -host_thumbnail_url, -host_picture_url, -host_neighbourhood, -host_verifications, -street, -neighbourhood, -city, -state, -zipcode, -market, -smart_location, -country_code, -country, -square_feet, -weekly_price, -monthly_price, -security_deposit, -cleaning_fee, -calendar_updated, -calendar_last_scraped, -first_review, -last_review, -license, -jurisdiction_names, -host_response_time, -host_response_rate, -neighbourhood_cleansed, -requires_license, -has_availability)
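
The ">20 NA values" criterion above can be verified by tabulating missing values per column before the select() call; a quick sketch:

# Count NAs per column; variables exceeding 20 missing values were dropped above
na.counts <- colSums(is.na(train))
sort(na.counts[na.counts > 20], decreasing = TRUE)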

# Collapse category levels that contain too few observations
train$property_type <- factor(ifelse(train$property_type == "Apartment", "Apartment", "Non-Apartment"))
test$property_type <- factor(ifelse(test$property_type == "Apartment", "Apartment", "Non-Apartment"))

train$room_type <- factor(train$room_type)
test$room_type <- factor(test$room_type)

train$bed_type <- factor(ifelse(train$bed_type == "Real Bed", "Real Bed", "Non-Real Bed"))
test$bed_type <- factor(ifelse(test$bed_type == "Real Bed", "Real Bed", "Non-Real Bed"))

# (as.character() prevents factor levels from being coerced to integer codes)
train$cancellation_policy <- factor(ifelse(train$cancellation_policy %in% c("super_strict_30", "super_strict_60"), "strict", as.character(train$cancellation_policy)))
test$cancellation_policy <- factor(ifelse(test$cancellation_policy %in% c("super_strict_30", "super_strict_60"), "strict", as.character(test$cancellation_policy)))

train$neighbourhood_group_cleansed <- factor(ifelse(train$neighbourhood_group_cleansed == "Manhattan", "Manhattan", "Non-Manhattan"))
test$neighbourhood_group_cleansed <- factor(ifelse(test$neighbourhood_group_cleansed == "Manhattan", "Manhattan", "Non-Manhattan"))
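
These collapsing choices can be sanity-checked by tabulating level counts on the raw columns (run before the recoding above); for example:

# Inspect category sizes to confirm which levels are sparse
sort(table(train$property_type), decreasing = TRUE)
sort(table(train$cancellation_policy), decreasing = TRUE)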

# Convert the true/false ("t"/"f") indicator variables to numeric 0/1
train$host_is_superhost <- ifelse(train$host_is_superhost == "t", 1, 0) 
test$host_is_superhost <- ifelse(test$host_is_superhost == "t", 1, 0) 

train$host_has_profile_pic <- ifelse(train$host_has_profile_pic == "t", 1, 0) 
test$host_has_profile_pic <- ifelse(test$host_has_profile_pic == "t", 1, 0) 

train$host_identity_verified <- ifelse(train$host_identity_verified == "t", 1, 0) 
test$host_identity_verified <- ifelse(test$host_identity_verified == "t", 1, 0) 

train$is_location_exact <- ifelse(train$is_location_exact == "t", 1, 0) 
test$is_location_exact <- ifelse(test$is_location_exact == "t", 1, 0) 

train$instant_bookable <- ifelse(train$instant_bookable == "t", 1, 0) 
test$instant_bookable <- ifelse(test$instant_bookable == "t", 1, 0) 

train$is_business_travel_ready <- ifelse(train$is_business_travel_ready == "t", 1, 0) 
test$is_business_travel_ready <- ifelse(test$is_business_travel_ready == "t", 1, 0) 

train$require_guest_profile_picture <- ifelse(train$require_guest_profile_picture == "t", 1, 0) 
test$require_guest_profile_picture <- ifelse(test$require_guest_profile_picture == "t", 1, 0) 

train$require_guest_phone_verification <- ifelse(train$require_guest_phone_verification == "t", 1, 0) 
test$require_guest_phone_verification <- ifelse(test$require_guest_phone_verification == "t", 1, 0) 
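
The eight conversions above all follow one pattern; an equivalent one-pass version with dplyr (>= 1.0), shown as a sketch that would replace the block above rather than run after it:

# Convert all true/false columns to 0/1 in one pass
tf.vars <- c("host_is_superhost", "host_has_profile_pic", "host_identity_verified",
             "is_location_exact", "instant_bookable", "is_business_travel_ready",
             "require_guest_profile_picture", "require_guest_phone_verification")
train <- train %>% mutate(across(all_of(tf.vars), ~ ifelse(. == "t", 1, 0)))
test <- test %>% mutate(across(all_of(tf.vars), ~ ifelse(. == "t", 1, 0)))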

# Extract the amenities string and count the number of unique amenities per listing
# (the quote-stripping regex in the original was malformed and left quotes in place)
count_amenities <- function(amenities) {
  sapply(amenities, function(x) {
    unlist(strsplit(x, ",")) %>%
      gsub('\\{|\\}|"', "", .) %>%  # strip the surrounding braces and quotes
      trimws() %>%
      unique() %>%
      length()
  }, USE.NAMES = FALSE)
}
train$amenities_num <- count_amenities(train$amenities)
test$amenities_num <- count_amenities(test$amenities)

# Remove the 14 observations whose outcome (price) is missing (na.omit() drops any row with an NA)
train <- na.omit(train)

price <- train$price
log.price <- log(price)

# Remove ID, price and amenities
train <- train %>% dplyr::select(-id, -price, -amenities)
train <- train %>% mutate(price, log.price)

# Remove observations with infinite values after log-transformation of price (i.e., price == 0);
# is.finite() avoids the empty-index pitfall of -which() when no such rows exist
train <- train[is.finite(train$log.price), ]

# Save the pre-processed dataset
list.airbnb <- list(train = train, test = test)
dput(list.airbnb, "airbnb_preprocessed")
# Load pre-processed dataset
list.airbnb <- dget("airbnb_preprocessed")
train <- list.airbnb$train 
test <- list.airbnb$test
price <- train$price
log.price <- train$log.price

# Visually check the relationships among variables
# ggpairs(train)

# Check the outcome variable with a histogram and a normal Q-Q plot
# png(paste0("./image/Fig_hist_qq.png"), res = 200, height = 6, width = 6, units = "in")
par(mfrow = c(2,2))
hist(train$price, breaks = "FD", xlab = "Price", main = "Histogram: Price")
qqnorm(train$price, pch = 1, frame = F, main = "Normal Q-Q plot: Price")
qqline(train$price, col = "red", lwd = 2)
hist(train$log.price, breaks = "FD", xlab = "log(Price)", main = "Histogram: log(Price)")
qqnorm(train$log.price, pch = 1, frame = F, main = "Normal Q-Q plot: log(Price)")
qqline(train$log.price, col = "red", lwd = 2)

# dev.off() %>% invisible()

2. Feature Selection

2-1. Univariate Analysis

Univariate analysis was performed between log.price and each covariate in the dataset.

# Distinguish factor variables from continuous variables
fctr.vars <- rep(0, ncol(train))
fctr.idx <- which(names(train) %in% c("host_is_superhost", "host_has_profile_pic", "host_identity_verified", "neighbourhood_group_cleansed", "is_location_exact", "property_type", "room_type", "bed_type", "instant_bookable", "is_business_travel_ready", "require_guest_profile_picture", "require_guest_phone_verification", "cancellation_policy"))
fctr.vars[fctr.idx] <- 1

# Generate a summary table to store the p-values and Pearson correlations
# (price and log.price occupy the last two columns of train and are excluded)
smry.univar.df <- data.frame(Variables = setdiff(names(train), c("price", "log.price")),
                             pval = NA,
                             correlation = NA)

# For each variable vs. the outcome: t-test (2-level factors), ANOVA (>2 levels), or Pearson correlation test (continuous)
for (i in 1:(ncol(train) - 2)) {
  x <- train[[i]]
  if (fctr.vars[i] == 1) {
    grp <- factor(x)
    if (nlevels(grp) == 2) {
      # Two-sample (Welch) t-test for binary factors
      smry.univar.df[i, 2] <- t.test(train$log.price ~ grp)$p.value
    } else if (nlevels(grp) > 2) {
      # One-way ANOVA for factors with more than two levels
      smry.univar.df[i, 2] <- anova(lm(train$log.price ~ grp))[1, "Pr(>F)"]
    }
  } else {
    # Pearson correlation test for continuous variables
    result <- cor.test(train$log.price, x)
    smry.univar.df[i, 2] <- result$p.value
    smry.univar.df[i, 3] <- result$estimate
  }
}

# Format p-values that are too small to display exactly
fpval.txt <- function (pval) {
  pval <- as.numeric(pval)
  if (pval < 0.001) 
    pval.txt <- "<0.001"
  else pval.txt <- as.character(pval)
  return(pval.txt)
}
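
A quick check of the helper on hypothetical values:

fpval.txt(0.0004) # "<0.001"
fpval.txt(0.0321) # "0.0321"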

# Format the univariate summary table (a numeric copy is kept for filtering below)
smry.univar.df.real <- smry.univar.df
smry.univar.df$pval <- ifelse(smry.univar.df$pval < 0.001, "<0.001", as.character(round(smry.univar.df$pval,3)))
smry.univar.df$correlation <- round(smry.univar.df$correlation, 3)

smry.univar.df.print <- smry.univar.df[which(smry.univar.df.real$pval < 0.05),] %>%
  arrange(desc(correlation))

# Keep all candidate predictors plus the outcome (the raw price column is dropped from here on)
train <- train[, c(smry.univar.df$Variables, "log.price")]
regulartable(smry.univar.df.print) %>% autofit() #%>% save_as_image(., "./image/Fig_univariate.png")

2-2. LASSO

# Dummy-code categorical predictor variables into a design matrix
x_vars <- model.matrix(log.price ~ ., train)
y_var <- train$log.price
lambda_seq <- 10^seq(2, -2, by = -.1)

# 5-fold cross validation to determine the best lambda
set.seed(123)
cv_output <- cv.glmnet(x_vars, y_var, alpha = 1, lambda = lambda_seq, nfolds = 5)
best_lam <- cv_output$lambda.min

# Rebuilding the model with the best lambda value identified
lasso_best <- glmnet(x_vars, y_var, alpha = 1, lambda = best_lam)
# coef(lasso_best, best_lam)
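
The 23 retained features listed below were read off the coefficient output; the nonzero coefficients can also be pulled programmatically, as in this sketch (dummy-coded level names still need to be mapped back to their source variables by hand):

# Names of predictors with nonzero LASSO coefficients
coefs <- as.matrix(coef(lasso_best))
setdiff(rownames(coefs)[which(coefs != 0)], "(Intercept)")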

# 23 features + outcome variable (total 24 variables) selected in the train set
best.vars <- c("log.price", "host_is_superhost", "host_identity_verified", "neighbourhood_group_cleansed", "longitude",
               "property_type", "room_type", "accommodates", "bathrooms", "bedrooms", "guests_included", 
               "extra_people", "minimum_nights", "availability_30", "availability_365", "review_scores_rating", 
               "review_scores_cleanliness", "review_scores_location", "review_scores_value", "instant_bookable", 
               "is_business_travel_ready", "calculated_host_listings_count", "reviews_per_month", "amenities_num")
# length(best.vars) # 24
train <- train[,best.vars] %>% as.data.frame()

3. Predictive Modeling

3-1. Multiple Linear Regression

# Fit on the train frame directly (the original dropped log.price from the data
# and relied on it being found in the global environment)
mod <- lm(log.price ~ ., data = train)
predicted.prices <- predict(mod, newdata = test)
submission <- data.frame(id = test$id, price = exp(predicted.prices))
write.csv(x = submission, file = "Model 1 Predictions_MultipleLinearRegression.csv", row.names = FALSE)
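
Because the model is fit on log(price), exponentiating the prediction targets the conditional median rather than the mean of price. If the mean is the quantity of interest, Duan's smearing estimator is one standard correction; a sketch:

# Duan's smearing factor: mean of exponentiated residuals on the log scale
smear <- mean(exp(residuals(mod)))
# submission$price <- exp(predicted.prices) * smear # optional mean-targeting version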

predicted.prices_train <- predict(mod, newdata = train)
rmse.lr <- RMSE(predicted.prices_train, train$log.price) %>% round(3)
r2.lr <- R2(predicted.prices_train, train$log.price) %>% round(3)

3-2. Decision Tree with Tuning

trControl <- trainControl(method = "cv", number = 5)
tuneGrid <- expand.grid(.cp = seq(from = 0.001, to = 0.1, by = 0.001))
set.seed(123)
cvModel <- train(log.price ~ ., data = train, method = "rpart", trControl = trControl, tuneGrid = tuneGrid)
# cvModel$results %>% arrange(RMSE)
cvTree <- rpart(log.price ~ ., data = train, cp = cvModel$bestTune$cp)
pred <- predict(cvTree, newdata = test)
submission <- data.frame(id = test$id, price = exp(pred))
write.csv(x = submission, file = "Model 1 Predictions_TreeTuning.csv", row.names = FALSE)

pred_train <- predict(cvTree, newdata = train)
rmse.tr <- RMSE(pred_train, train$log.price)
r2.tr <- R2(pred_train, train$log.price)
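
To inspect the tuned tree itself, one option is the rpart.plot package (an assumption; base plot()/text() on the rpart object works too):

# Visualize the cross-validation-tuned tree
library(rpart.plot)
rpart.plot(cvTree, main = paste0("Tuned tree (cp = ", cvModel$bestTune$cp, ")"))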

3-3. Random Forest

set.seed(123)
forest <- randomForest(log.price ~ ., data = train, ntree = 1000)
pred <- predict(forest, newdata = test)
submission <- data.frame(id = test$id, price = exp(pred))
write.csv(x = submission, file = "Model 1 Predictions_RandomForest.csv", row.names = FALSE)

# Dot chart of variable importance from the random forest
# png(paste0("./image/Fig_RF_varimpplot.png"), res = 150, height = 6, width = 8, units = "in")
varImpPlot(forest, main = "Random Forest (ntree = 1000)")

# dev.off() %>% invisible()

pred_train_rf <- predict(forest, newdata = train)
rmse.rf <- RMSE(pred_train_rf, train$log.price)
r2.rf <- R2(pred_train_rf, train$log.price)
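
Predicting on the training rows reuses in-bag information, so rmse.rf is optimistic; the out-of-bag error stored in the randomForest fit gives a less biased estimate:

# OOB RMSE: forest$mse holds the out-of-bag MSE after each successive tree
rmse.rf.oob <- sqrt(tail(forest$mse, 1))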

3-4. Boosting Model

set.seed(123)
boost <- gbm(log.price ~ ., data = train, distribution = "gaussian", n.trees = 500, interaction.depth = 2, shrinkage = 0.01)
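
With n.trees fixed at 500, it is worth confirming that more iterations would not help; gbm.perf() estimates the optimal iteration count (the OOB criterion tends to be conservative):

# Estimate the optimal number of boosting iterations
best.iter <- gbm.perf(boost, method = "OOB", plot.it = FALSE)
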
# summary.gbm()'s plot switch is `plotit` (not `plot`); compute the table once
bst.imp <- summary(boost, plotit = FALSE)
smry.bst <- data.frame(vars = bst.imp$var,
                       rel.inf = bst.imp$rel.inf) %>% filter(rel.inf > 0.1) %>% arrange(desc(rel.inf))
smry.bst$vars <- factor(smry.bst$vars, levels = rev(smry.bst$vars))

# Dot chart of variable importance from the boosting model
# png(paste0("./image/Fig_Boosting.png"), res = 200, height = 4, width = 8, units = "in")
ggplot(smry.bst,
       aes(x = rel.inf, y = vars)) + 
  geom_point(size = 3) + 
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, size = 15, face = "bold"),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.y = element_line(colour = "grey60", linetype = "dashed")
  ) + labs(title = "Boosting Model") + xlab("Relative Influence") + ylab("Variables")

# dev.off() %>% invisible()

pred <- predict(boost, n.trees = 500, newdata = test)
submission <- data.frame(id = test$id, price = exp(pred))
write.csv(x = submission, file = "Model 1 Predictions_Boosting.csv", row.names = FALSE)

pred_train_bst <- predict(boost, n.trees = 500, newdata = train)
rmse.bst <- RMSE(pred_train_bst, train$log.price)
r2.bst <- R2(pred_train_bst, train$log.price)

3-5. Summary Table of Model Performance

smry.tbl <- data.frame(Models = c("Multiple Linear Regression", "Decision Tree with Tuning", "Random Forest", "Boosting Model"),
           RMSE = c(rmse.lr, rmse.tr, rmse.rf, rmse.bst) %>% round(3),
           `R2` = c(r2.lr, r2.tr, r2.rf, r2.bst) %>% round(3))
smry.tbl %>% regulartable() %>% autofit() #%>% save_as_image(., "./image/Fig_RMSEsmry.png")
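
Note that all four RMSE and R2 values are computed on the training set, which flatters the more flexible models (the random forest in particular); cross-validated or held-out estimates would give a fairer comparison.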