setwd("C:/Users/Thomas/Coursera/Stats1")
library(ggplot2)
library(dplyr)
library(statsr)
library(rvest)
library(leaps)
library(BAS)
load("movies.Rdata")
# feature_film: "yes" if title_type is Feature Film, "no" otherwise
movies$feature_film <- as.factor(ifelse(movies$title_type == 'Feature Film',"yes","no"))
# drama: "yes" if genre is Drama, "no" otherwise
movies$drama <- as.factor(ifelse(movies$genre == 'Drama',"yes","no"))
# mpaa_rating_R: "yes" if mpaa_rating is R, "no" otherwise
movies$mpaa_rating_R <- as.factor(ifelse(movies$mpaa_rating == 'R',"yes","no"))
# oscar_season: "yes" if movie is released in November, October, or December (based on thtr_rel_month), "no" otherwise
movies$oscar_season <- as.factor(ifelse(movies$thtr_rel_month == 10, "yes",
ifelse(movies$thtr_rel_month == 11, "yes",
ifelse(movies$thtr_rel_month == 12, "yes", "no"))))
# summer_season: "yes" if movie is released in May, June, July, or August (based on thtr_rel_month), "no" otherwise
movies$summer_season <- as.factor(ifelse(movies$thtr_rel_month == 5, "yes",
ifelse(movies$thtr_rel_month == 6, "yes",
ifelse(movies$thtr_rel_month == 7, "yes",
ifelse(movies$thtr_rel_month == 8, "yes", "no")))))
# Keep just columns for model
movies <- movies[,c("audience_score","feature_film","drama","runtime","mpaa_rating_R",
"thtr_rel_year","oscar_season","summer_season","imdb_rating","imdb_num_votes",
"critics_score","best_pic_nom","best_pic_win","best_actor_win",
"best_actress_win","best_dir_win","top200_box")]
movies <- movies[,c("audience_score","feature_film","drama","runtime","mpaa_rating_R",
"thtr_rel_year","oscar_season","summer_season","imdb_rating","imdb_num_votes",
"critics_score","best_pic_nom","best_pic_win","best_actor_win",
"best_actress_win","best_dir_win","top200_box")]
# Keep only compete sets of data
movies <- movies[complete.cases(movies),]
str(movies)
## Classes 'tbl_df', 'tbl' and 'data.frame': 650 obs. of 17 variables:
## $ audience_score : num 73 81 91 76 27 86 76 47 89 66 ...
## $ feature_film : Factor w/ 2 levels "no","yes": 2 2 2 2 2 1 2 2 1 2 ...
## $ drama : Factor w/ 2 levels "no","yes": 2 2 1 2 1 1 2 2 1 2 ...
## $ runtime : num 80 101 84 139 90 78 142 93 88 119 ...
## $ mpaa_rating_R : Factor w/ 2 levels "no","yes": 2 1 2 1 2 1 1 2 1 1 ...
## $ thtr_rel_year : num 2013 2001 1996 1993 2004 ...
## $ oscar_season : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 2 1 1 ...
## $ summer_season : Factor w/ 2 levels "no","yes": 1 1 2 1 1 1 1 1 1 1 ...
## $ imdb_rating : num 5.5 7.3 7.6 7.2 5.1 7.8 7.2 5.5 7.5 6.6 ...
## $ imdb_num_votes : int 899 12285 22381 35096 2386 333 5016 2272 880 12496 ...
## $ critics_score : num 45 96 91 80 33 91 57 17 90 83 ...
## $ best_pic_nom : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ best_pic_win : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ best_actor_win : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 2 1 1 ...
## $ best_actress_win: Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ best_dir_win : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
## $ top200_box : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
# Box Plots and Summary statistics on newly created variable feature_film
boxplot(audience_score ~ feature_film, data=movies,
main="Audience Score vs. Feature Film", xlab="Feature Film",
ylab="Audience Score")
summary(movies$feature_film)
## no yes
## 59 591
summary(movies[movies$feature_film=='no',]$audience_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 19.0 77.0 86.0 81.2 89.0 96.0
sd(movies[movies$feature_film=='no',]$audience_score)
## [1] 13.64043
summary(movies[movies$feature_film=='yes',]$audience_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 11.00 44.50 62.00 60.47 78.00 97.00
sd(movies[movies$feature_film=='yes',]$audience_score)
## [1] 19.82402
bayes_inference(y = audience_score, x = feature_film, data = movies, statistic = "mean", type = "ht",null=0, alternative = "twosided")
## Response variable: numerical, Explanatory variable: categorical (2 levels)
## n_no = 59, y_bar_no = 81.2034, s_no = 13.6404
## n_yes = 591, y_bar_yes = 60.4653, s_yes = 19.824
## (Assuming intrinsic prior on parameters)
## Hypotheses:
## H1: mu_no = mu_yes
## H2: mu_no != mu_yes
##
## Priors:
## P(H1) = 0.5
## P(H2) = 0.5
##
## Results:
## BF[H2:H1] = 2.794604e+13
## P(H1|data) = 0
## P(H2|data) = 1
# Box Plots and Summary statistics on newly created variable drama
boxplot(audience_score ~ drama, data=movies,
main="Audience Score vs. Drama Genre", xlab="Drama",
ylab="Audience Score")
summary(movies$drama)
## no yes
## 345 305
summary(movies[movies$drama=='no',]$audience_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 11.0 41.0 61.0 59.7 79.0 97.0
sd(movies[movies$drama=='no',]$audience_score)
## [1] 21.29807
summary(movies[movies$drama=='yes',]$audience_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.00 52.00 70.00 65.35 80.00 95.00
sd(movies[movies$drama=='yes',]$audience_score)
## [1] 18.54184
bayes_inference(y = audience_score, x = drama, data = movies, statistic = "mean", type = "ht",null=0, alternative = "twosided")
## Response variable: numerical, Explanatory variable: categorical (2 levels)
## n_no = 345, y_bar_no = 59.6957, s_no = 21.2981
## n_yes = 305, y_bar_yes = 65.3475, s_yes = 18.5418
## (Assuming intrinsic prior on parameters)
## Hypotheses:
## H1: mu_no = mu_yes
## H2: mu_no != mu_yes
##
## Priors:
## P(H1) = 0.5
## P(H2) = 0.5
##
## Results:
## BF[H2:H1] = 24.1609
## P(H1|data) = 0.0397
## P(H2|data) = 0.9603
# Box Plots and Summary statistics on newly created variable mpaa_rating_R
boxplot(audience_score ~ mpaa_rating_R, data=movies,
main="Audience Score vs. MPAA R", xlab="MPAA R",
ylab="Audience Score")
summary(movies$drama)
## no yes
## 345 305
summary(movies[movies$mpaa_rating_R=='no',]$audience_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 11.00 48.00 65.00 62.66 80.00 96.00
sd(movies[movies$mpaa_rating_R=='no',]$audience_score)
## [1] 20.34177
summary(movies[movies$mpaa_rating_R=='yes',]$audience_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 14.00 44.00 64.00 62.04 79.00 97.00
sd(movies[movies$mpaa_rating_R=='yes',]$audience_score)
## [1] 20.1559
bayes_inference(y = audience_score, x = mpaa_rating_R, data = movies, statistic = "mean", type = "ht",null=0, alternative = "twosided")
## Response variable: numerical, Explanatory variable: categorical (2 levels)
## n_no = 321, y_bar_no = 62.6604, s_no = 20.3418
## n_yes = 329, y_bar_yes = 62.0426, s_yes = 20.1559
## (Assuming intrinsic prior on parameters)
## Hypotheses:
## H1: mu_no = mu_yes
## H2: mu_no != mu_yes
##
## Priors:
## P(H1) = 0.5
## P(H2) = 0.5
##
## Results:
## BF[H1:H2] = 24.1282
## P(H1|data) = 0.9602
## P(H2|data) = 0.0398
# Box Plots and Summary statistics on newly created variable oscar_season
boxplot(audience_score ~ oscar_season, data=movies,
main="Audience Score vs. Oscar Season", xlab="Oscar Season",
ylab="Audience Score")
summary(movies$drama)
## no yes
## 345 305
summary(movies[movies$oscar_season=='no',]$audience_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 11.00 46.00 64.00 61.81 79.00 96.00
sd(movies[movies$oscar_season=='no',]$audience_score)
## [1] 20.11957
summary(movies[movies$oscar_season=='yes',]$audience_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.00 47.25 68.50 63.64 81.00 97.00
sd(movies[movies$oscar_season=='yes',]$audience_score)
## [1] 20.50625
bayes_inference(y = audience_score, x = oscar_season, data = movies, statistic = "mean", type = "ht",null=0, alternative = "twosided")
## Response variable: numerical, Explanatory variable: categorical (2 levels)
## n_no = 460, y_bar_no = 61.813, s_no = 20.1196
## n_yes = 190, y_bar_yes = 63.6421, s_yes = 20.5062
## (Assuming intrinsic prior on parameters)
## Hypotheses:
## H1: mu_no = mu_yes
## H2: mu_no != mu_yes
##
## Priors:
## P(H1) = 0.5
## P(H2) = 0.5
##
## Results:
## BF[H1:H2] = 13.7738
## P(H1|data) = 0.9323
## P(H2|data) = 0.0677
# Box Plots and Summary statistics on newly created variable summer_season
boxplot(audience_score ~ summer_season, data=movies,
main="Audience Score vs. Summer Season", xlab="Summer Season",
ylab="Audience Score")
summary(movies$drama)
## no yes
## 345 305
summary(movies[movies$summer_season=='no',]$audience_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.0 46.0 65.5 62.6 80.0 97.0
sd(movies[movies$summer_season=='no',]$audience_score)
## [1] 20.40385
summary(movies[movies$summer_season=='yes',]$audience_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 11.00 44.75 65.00 61.81 78.00 94.00
sd(movies[movies$summer_season=='yes',]$audience_score)
## [1] 19.90828
bayes_inference(y = audience_score, x = summer_season, data = movies, statistic = "mean", type = "ht",null=0, alternative = "twosided")
## Response variable: numerical, Explanatory variable: categorical (2 levels)
## n_no = 442, y_bar_no = 62.6018, s_no = 20.4039
## n_yes = 208, y_bar_yes = 61.8077, s_yes = 19.9083
## (Assuming intrinsic prior on parameters)
## Hypotheses:
## H1: mu_no = mu_yes
## H2: mu_no != mu_yes
##
## Priors:
## P(H1) = 0.5
## P(H2) = 0.5
##
## Results:
## BF[H1:H2] = 21.8404
## P(H1|data) = 0.9562
## P(H2|data) = 0.0438
movies.ZS = bas.lm(audience_score ~ .,
data = movies,
prior = "ZS-null",
modelprior = uniform(),
method = "MCMC")
movies.ZS
##
## Call:
## bas.lm(formula = audience_score ~ ., data = movies, prior = "ZS-null", modelprior = uniform(), method = "MCMC")
##
##
## Marginal Posterior Inclusion Probabilities:
## Intercept feature_filmyes dramayes
## 1.00000 0.06474 0.04289
## runtime mpaa_rating_Ryes thtr_rel_year
## 0.46329 0.19962 0.09140
## oscar_seasonyes summer_seasonyes imdb_rating
## 0.07460 0.08031 1.00000
## imdb_num_votes critics_score best_pic_nomyes
## 0.05781 0.88323 0.13360
## best_pic_winyes best_actor_winyes best_actress_winyes
## 0.03938 0.14372 0.14130
## best_dir_winyes top200_boxyes
## 0.06636 0.04693
# From the inclusion probabilities > ~ 0.5
movies.ZS = bas.lm(audience_score ~ imdb_rating + critics_score + runtime,
data = movies,
prior = "ZS-null",
modelprior = uniform(),
method = "MCMC")
diagnostics(movies.ZS)
## Warning in if (type == "pip") {: the condition has length > 1 and only the
## first element will be used
plot(movies.ZS, which=1)
plot(movies.ZS, which=2)
plot(movies.ZS, which=3)
plot(movies.ZS, which=4)
image(movies.ZS, rotate=F)
movies.coef = coefficients(movies.ZS)
par(mfrow=c(1,3))
plot(movies.coef)
par(mfrow=c(1,1))
movies.coef
##
## Marginal Posterior Summaries of Coefficients:
## post mean post SD post p(B != 0)
## Intercept 62.34769 0.39505 1.00000
## imdb_rating 14.96064 0.73944 1.00000
## critics_score 0.06358 0.03061 0.88549
## runtime -0.02551 0.03061 0.47244
movies.gprior = bas.lm(audience_score ~ imdb_rating + critics_score + runtime,
data = movies,
alpha=3, prior="g-prior")
movies.coef = coef(movies.gprior)
confint(movies.coef, approx=FALSE, nsim=10000)
## 2.5 % 97.5 % beta
## Intercept 61.57956474 6.309805e+01 62.34769231
## imdb_rating 10.24045642 1.244764e+01 11.28286550
## critics_score 0.00000000 8.363595e-02 0.04893579
## runtime -0.07020603 1.269273e-04 -0.03408574
## attr(,"Probability")
## [1] 0.95
## attr(,"class")
## [1] "confint.bas"
lovefriend <- data.frame(c(""))
lovefriend$audience_score <- 64
lovefriend$imdb_rating <- 6.7
lovefriend$critics_score <- 97
lovefriend$runtime <- 92
predict_score <- predict(movies.gprior, newdata=lovefriend, estimator="BMA", se.fit=T)
predict_score$fit
## [,1]
## [1,] 67.09629
sd(predict_score$Ypred)
## [1] 6.07372