1. Preprocessing data and EDA (Exploratory Data Analysis)

# Read in the datasets
train <- read.csv("train.csv", na.strings = "N/A") # 23313 x 96
test <- read.csv("test.csv", na.strings = "N/A") # 5829 x 95

# Remove unnecessary variables and variables with >20 NA values
train <- train %>% dplyr::select(-listing_url, -scrape_id, -last_scraped, -name, -summary, -space, -description, -experiences_offered, -neighborhood_overview, -notes, -transit, -access, -interaction, -house_rules, -thumbnail_url, -medium_url, -picture_url, -xl_picture_url, -host_id, -host_url, -host_name, -host_since, -host_location, -host_about, -host_acceptance_rate, -host_thumbnail_url, -host_picture_url, -host_neighbourhood, -host_verifications, -street, -neighbourhood, -city, -state, -zipcode, -market, -smart_location, -country_code, -country, -square_feet, -weekly_price, -monthly_price, -security_deposit, -cleaning_fee, -calendar_updated, -calendar_last_scraped, -first_review, -last_review, -license, -jurisdiction_names, -host_response_time, -host_response_rate, -neighbourhood_cleansed, -requires_license, -has_availability)
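
The ">20 NA values" criterion above can be verified by tabulating missing values per column before the select() call; a quick sketch:

# Count NAs per column; variables exceeding 20 missing values were dropped above
na.counts <- colSums(is.na(train))
sort(na.counts[na.counts > 20], decreasing = TRUE)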

# Collapse category levels that contain too few observations
train$property_type <- factor(ifelse(train$property_type == "Apartment", "Apartment", "Non-Apartment"))
test$property_type <- factor(ifelse(test$property_type == "Apartment", "Apartment", "Non-Apartment"))

train$room_type <- factor(train$room_type)
test$room_type <- factor(test$room_type)

train$bed_type <- factor(ifelse(train$bed_type == "Real Bed", "Real Bed", "Non-Real Bed"))
test$bed_type <- factor(ifelse(test$bed_type == "Real Bed", "Real Bed", "Non-Real Bed"))

# (as.character() prevents factor levels from being coerced to integer codes)
train$cancellation_policy <- factor(ifelse(train$cancellation_policy %in% c("super_strict_30", "super_strict_60"), "strict", as.character(train$cancellation_policy)))
test$cancellation_policy <- factor(ifelse(test$cancellation_policy %in% c("super_strict_30", "super_strict_60"), "strict", as.character(test$cancellation_policy)))

train$neighbourhood_group_cleansed <- factor(ifelse(train$neighbourhood_group_cleansed == "Manhattan", "Manhattan", "Non-Manhattan"))
test$neighbourhood_group_cleansed <- factor(ifelse(test$neighbourhood_group_cleansed == "Manhattan", "Manhattan", "Non-Manhattan"))
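
These collapsing choices can be sanity-checked by tabulating level counts on the raw columns (run before the recoding above); for example:

# Inspect category sizes to confirm which levels are sparse
sort(table(train$property_type), decreasing = TRUE)
sort(table(train$cancellation_policy), decreasing = TRUE)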

# Convert the true/false ("t"/"f") indicator variables to numeric 0/1
train$host_is_superhost <- ifelse(train$host_is_superhost == "t", 1, 0) 
test$host_is_superhost <- ifelse(test$host_is_superhost == "t", 1, 0) 

train$host_has_profile_pic <- ifelse(train$host_has_profile_pic == "t", 1, 0) 
test$host_has_profile_pic <- ifelse(test$host_has_profile_pic == "t", 1, 0) 

train$host_identity_verified <- ifelse(train$host_identity_verified == "t", 1, 0) 
test$host_identity_verified <- ifelse(test$host_identity_verified == "t", 1, 0) 

train$is_location_exact <- ifelse(train$is_location_exact == "t", 1, 0) 
test$is_location_exact <- ifelse(test$is_location_exact == "t", 1, 0) 

train$instant_bookable <- ifelse(train$instant_bookable == "t", 1, 0) 
test$instant_bookable <- ifelse(test$instant_bookable == "t", 1, 0) 

train$is_business_travel_ready <- ifelse(train$is_business_travel_ready == "t", 1, 0) 
test$is_business_travel_ready <- ifelse(test$is_business_travel_ready == "t", 1, 0) 

train$require_guest_profile_picture <- ifelse(train$require_guest_profile_picture == "t", 1, 0) 
test$require_guest_profile_picture <- ifelse(test$require_guest_profile_picture == "t", 1, 0) 

train$require_guest_phone_verification <- ifelse(train$require_guest_phone_verification == "t", 1, 0) 
test$require_guest_phone_verification <- ifelse(test$require_guest_phone_verification == "t", 1, 0) 
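
The eight conversions above all follow one pattern; an equivalent one-pass version with dplyr (>= 1.0), shown as a sketch that would replace the block above rather than run after it:

# Convert all true/false columns to 0/1 in one pass
tf.vars <- c("host_is_superhost", "host_has_profile_pic", "host_identity_verified",
             "is_location_exact", "instant_bookable", "is_business_travel_ready",
             "require_guest_profile_picture", "require_guest_phone_verification")
train <- train %>% mutate(across(all_of(tf.vars), ~ ifelse(. == "t", 1, 0)))
test <- test %>% mutate(across(all_of(tf.vars), ~ ifelse(. == "t", 1, 0)))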

# Extract the amenities string and count the number of unique amenities per listing
# (the quote-stripping regex in the original was malformed and left quotes in place)
count_amenities <- function(amenities) {
  sapply(amenities, function(x) {
    unlist(strsplit(x, ",")) %>%
      gsub('\\{|\\}|"', "", .) %>%  # strip the surrounding braces and quotes
      trimws() %>%
      unique() %>%
      length()
  }, USE.NAMES = FALSE)
}
train$amenities_num <- count_amenities(train$amenities)
test$amenities_num <- count_amenities(test$amenities)

# Remove the 14 observations whose outcome (price) is missing (na.omit() drops any row with an NA)
train <- na.omit(train)

price <- train$price
log.price <- log(price)

# Remove ID, price and amenities
train <- train %>% dplyr::select(-id, -price, -amenities)
train <- train %>% mutate(price, log.price)

# Remove observations with infinite values after log-transformation of price (i.e., price == 0);
# is.finite() avoids the empty-index pitfall of -which() when no such rows exist
train <- train[is.finite(train$log.price), ]

# Save the pre-processed dataset
list.airbnb <- list(train = train, test = test)
dput(list.airbnb, "airbnb_preprocessed")
# Load pre-processed dataset
list.airbnb <- dget("airbnb_preprocessed")
train <- list.airbnb$train 
test <- list.airbnb$test
price <- train$price
log.price <- train$log.price

# Visually check the relationships among variables
# ggpairs(train)

# Check the outcome variable with a histogram and a normal Q-Q plot
# png(paste0("./image/Fig_hist_qq.png"), res = 200, height = 6, width = 6, units = "in")
par(mfrow = c(2,2))
hist(train$price, breaks = "FD", xlab = "Price", main = "Histogram: Price")
qqnorm(train$price, pch = 1, frame = F, main = "Normal Q-Q plot: Price")
qqline(train$price, col = "red", lwd = 2)
hist(train$log.price, breaks = "FD", xlab = "log(Price)", main = "Histogram: log(Price)")
qqnorm(train$log.price, pch = 1, frame = F, main = "Normal Q-Q plot: log(Price)")
qqline(train$log.price, col = "red", lwd = 2)

# dev.off() %>% invisible()

2. Feature Selection

2-1. Univariate Analysis

Univariate analysis was performed between log.price and each covariate in the dataset.

# Distinguish factor variables from continuous variables
fctr.vars <- rep(0, ncol(train))
fctr.idx <- which(names(train) %in% c("host_is_superhost", "host_has_profile_pic", "host_identity_verified", "neighbourhood_group_cleansed", "is_location_exact", "property_type", "room_type", "bed_type", "instant_bookable", "is_business_travel_ready", "require_guest_profile_picture", "require_guest_phone_verification", "cancellation_policy"))
fctr.vars[fctr.idx] <- 1

# Generate a summary table to store the p-values and Pearson correlations
# (price and log.price occupy the last two columns of train and are excluded)
smry.univar.df <- data.frame(Variables = setdiff(names(train), c("price", "log.price")),
                             pval = NA,
                             correlation = NA)

# For each variable vs. the outcome: t-test (2-level factors), ANOVA (>2 levels), or Pearson correlation test (continuous)
for (i in 1:(ncol(train) - 2)) {
  x <- train[[i]]
  if (fctr.vars[i] == 1) {
    grp <- factor(x)
    if (nlevels(grp) == 2) {
      # Two-sample (Welch) t-test for binary factors
      smry.univar.df[i, 2] <- t.test(train$log.price ~ grp)$p.value
    } else if (nlevels(grp) > 2) {
      # One-way ANOVA for factors with more than two levels
      smry.univar.df[i, 2] <- anova(lm(train$log.price ~ grp))[1, "Pr(>F)"]
    }
  } else {
    # Pearson correlation test for continuous variables
    result <- cor.test(train$log.price, x)
    smry.univar.df[i, 2] <- result$p.value
    smry.univar.df[i, 3] <- result$estimate
  }
}

# Format p-values that are too small to display exactly
fpval.txt <- function (pval) {
  pval <- as.numeric(pval)
  if (pval < 0.001) 
    pval.txt <- "<0.001"
  else pval.txt <- as.character(pval)
  return(pval.txt)
}
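
A quick check of the helper on hypothetical values:

fpval.txt(0.0004) # "<0.001"
fpval.txt(0.0321) # "0.0321"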

# Format the univariate summary table (a numeric copy is kept for filtering below)
smry.univar.df.real <- smry.univar.df
smry.univar.df$pval <- ifelse(smry.univar.df$pval < 0.001, "<0.001", as.character(round(smry.univar.df$pval,3)))
smry.univar.df$correlation <- round(smry.univar.df$correlation, 3)

smry.univar.df.print <- smry.univar.df[which(smry.univar.df.real$pval < 0.05),] %>%
  arrange(desc(correlation))

# Keep all candidate predictors plus the outcome (the raw price column is dropped from here on)
train <- train[, c(smry.univar.df$Variables, "log.price")]
regulartable(smry.univar.df.print) %>% autofit() #%>% save_as_image(., "./image/Fig_univariate.png")

2-2. LASSO

# Dummy-code categorical predictor variables into a design matrix
x_vars <- model.matrix(log.price ~ ., train)
y_var <- train$log.price
lambda_seq <- 10^seq(2, -2, by = -.1)

# 5-fold cross validation to determine the best lambda
set.seed(123)
cv_output <- cv.glmnet(x_vars, y_var, alpha = 1, lambda = lambda_seq, nfolds = 5)
best_lam <- cv_output$lambda.min

# Rebuilding the model with the best lambda value identified
lasso_best <- glmnet(x_vars, y_var, alpha = 1, lambda = best_lam)
# coef(lasso_best, best_lam)
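
The 23 retained features listed below were read off the coefficient output; the nonzero coefficients can also be pulled programmatically, as in this sketch (dummy-coded level names still need to be mapped back to their source variables by hand):

# Names of predictors with nonzero LASSO coefficients
coefs <- as.matrix(coef(lasso_best))
setdiff(rownames(coefs)[which(coefs != 0)], "(Intercept)")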

# 23 features + outcome variable (total 24 variables) selected in the train set
best.vars <- c("log.price", "host_is_superhost", "host_identity_verified", "neighbourhood_group_cleansed", "longitude",
               "property_type", "room_type", "accommodates", "bathrooms", "bedrooms", "guests_included", 
               "extra_people", "minimum_nights", "availability_30", "availability_365", "review_scores_rating", 
               "review_scores_cleanliness", "review_scores_location", "review_scores_value", "instant_bookable", 
               "is_business_travel_ready", "calculated_host_listings_count", "reviews_per_month", "amenities_num")
# length(best.vars) # 24
train <- train[,best.vars] %>% as.data.frame()

3. Predictive Modeling

3-1. Multiple Linear Regression

# Fit on the train frame directly (the original dropped log.price from the data
# and relied on it being found in the global environment)
mod <- lm(log.price ~ ., data = train)
predicted.prices <- predict(mod, newdata = test)
submission <- data.frame(id = test$id, price = exp(predicted.prices))
write.csv(x = submission, file = "Model 1 Predictions_MultipleLinearRegression.csv", row.names = FALSE)
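
Because the model is fit on log(price), exponentiating the prediction targets the conditional median rather than the mean of price. If the mean is the quantity of interest, Duan's smearing estimator is one standard correction; a sketch:

# Duan's smearing factor: mean of exponentiated residuals on the log scale
smear <- mean(exp(residuals(mod)))
# submission$price <- exp(predicted.prices) * smear # optional mean-targeting version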

predicted.prices_train <- predict(mod, newdata = train)
rmse.lr <- RMSE(predicted.prices_train, train$log.price) %>% round(3)
r2.lr <- R2(predicted.prices_train, train$log.price) %>% round(3)

3-2. Decision Tree with Tuning

trControl <- trainControl(method = "cv", number = 5)
tuneGrid <- expand.grid(.cp = seq(from = 0.001, to = 0.1, by = 0.001))
set.seed(123)
cvModel <- train(log.price ~ ., data = train, method = "rpart", trControl = trControl, tuneGrid = tuneGrid)
# cvModel$results %>% arrange(RMSE)
cvTree <- rpart(log.price ~ ., data = train, cp = cvModel$bestTune$cp)
pred <- predict(cvTree, newdata = test)
submission <- data.frame(id = test$id, price = exp(pred))
write.csv(x = submission, file = "Model 1 Predictions_TreeTuning.csv", row.names = FALSE)

pred_train <- predict(cvTree, newdata = train)
rmse.tr <- RMSE(pred_train, train$log.price)
r2.tr <- R2(pred_train, train$log.price)
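
To inspect the tuned tree itself, one option is the rpart.plot package (an assumption; base plot()/text() on the rpart object works too):

# Visualize the cross-validation-tuned tree
library(rpart.plot)
rpart.plot(cvTree, main = paste0("Tuned tree (cp = ", cvModel$bestTune$cp, ")"))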

3-3. Random Forest

set.seed(123)
forest <- randomForest(log.price ~ ., data = train, ntree = 1000)
pred <- predict(forest, newdata = test)
submission <- data.frame(id = test$id, price = exp(pred))
write.csv(x = submission, file = "Model 1 Predictions_RandomForest.csv", row.names = FALSE)

# Dot chart of variable importance from the random forest
# png(paste0("./image/Fig_RF_varimpplot.png"), res = 150, height = 6, width = 8, units = "in")
varImpPlot(forest, main = "Random Forest (ntree = 1000)")

# dev.off() %>% invisible()

pred_train_rf <- predict(forest, newdata = train)
rmse.rf <- RMSE(pred_train_rf, train$log.price)
r2.rf <- R2(pred_train_rf, train$log.price)
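
Predicting on the training rows reuses in-bag information, so rmse.rf is optimistic; the out-of-bag error stored in the randomForest fit gives a less biased estimate:

# OOB RMSE: forest$mse holds the out-of-bag MSE after each successive tree
rmse.rf.oob <- sqrt(tail(forest$mse, 1))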

3-4. Boosting Model

set.seed(123)
boost <- gbm(log.price ~ ., data = train, distribution = "gaussian", n.trees = 500, interaction.depth = 2, shrinkage = 0.01)
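
With n.trees fixed at 500, it is worth confirming that more iterations would not help; gbm.perf() estimates the optimal iteration count (the OOB criterion tends to be conservative):

# Estimate the optimal number of boosting iterations
best.iter <- gbm.perf(boost, method = "OOB", plot.it = FALSE)
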
# summary.gbm()'s plot switch is `plotit` (not `plot`); compute the table once
bst.imp <- summary(boost, plotit = FALSE)
smry.bst <- data.frame(vars = bst.imp$var,
                       rel.inf = bst.imp$rel.inf) %>% filter(rel.inf > 0.1) %>% arrange(desc(rel.inf))
smry.bst$vars <- factor(smry.bst$vars, levels = rev(smry.bst$vars))

# Dot chart of variable importance from the boosting model
# png(paste0("./image/Fig_Boosting.png"), res = 200, height = 4, width = 8, units = "in")
ggplot(smry.bst,
       aes(x = rel.inf, y = vars)) + 
  geom_point(size = 3) + 
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, size = 15, face = "bold"),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.y = element_line(colour = "grey60", linetype = "dashed")
  ) + labs(title = "Boosting Model") + xlab("Relative Influence") + ylab("Variables")

# dev.off() %>% invisible()

pred <- predict(boost, n.trees = 500, newdata = test)
submission <- data.frame(id = test$id, price = exp(pred))
write.csv(x = submission, file = "Model 1 Predictions_Boosting.csv", row.names = FALSE)

pred_train_bst <- predict(boost, n.trees = 500, newdata = train)
rmse.bst <- RMSE(pred_train_bst, train$log.price)
r2.bst <- R2(pred_train_bst, train$log.price)

3-5. Summary Table of Model Performance

smry.tbl <- data.frame(Models = c("Multiple Linear Regression", "Decision Tree with Tuning", "Random Forest", "Boosting Model"),
           RMSE = c(rmse.lr, rmse.tr, rmse.rf, rmse.bst) %>% round(3),
           `R2` = c(r2.lr, r2.tr, r2.rf, r2.bst) %>% round(3))
smry.tbl %>% regulartable() %>% autofit() #%>% save_as_image(., "./image/Fig_RMSEsmry.png")
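
Note that all four RMSE and R2 values are computed on the training set, which flatters the more flexible models (the random forest in particular); cross-validated or held-out estimates would give a fairer comparison.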