library(magrittr)
library(ggplot2)
library(plyr)
library(dplyr)
# Restore the objects saved in brfss2013.RData (looked up relative to the
# working directory). The analysis below reads a data frame named `brfss2013`,
# presumably supplied by this file -- TODO confirm.
load("brfss2013.RData")
#' Drop rows that have a missing value in any of the selected columns.
#'
#' @param data A data frame (or tibble).
#' @param desiredCols Column names or indices whose values must all be
#'   non-missing for a row to be kept.
#' @return `data` restricted to the rows that are complete in `desiredCols`.
#'   All columns are kept, and the result is always a data frame, even when
#'   `data` has a single column.
removeNA <- function(data, desiredCols) {
  # drop = FALSE keeps both subsets as data frames; without it a one-column
  # data frame would silently collapse to a bare vector.
  completeVec <- complete.cases(data[, desiredCols, drop = FALSE])
  data[completeVec, , drop = FALSE]
}
There are almost 500,000 observations and 330 variables, and according to the CDC, some of the variables are optional.
The Behavioral Risk Factor Surveillance System (BRFSS) is the nation’s premier system of health-related telephone surveys that collect state data about U.S. residents regarding their health-related risk behaviors, chronic health conditions, and use of preventive services. The system was established in 1984. The BRFSS objective is to collect uniform, state-specific data on preventive health practices and risk behaviors that are linked to chronic diseases, injuries, and preventable infectious diseases that affect the adult population. Factors assessed by the BRFSS in 2013 include tobacco use, HIV/AIDS knowledge and prevention, exercise, immunization, health status, healthy days — health-related quality of life, health care access, inadequate sleep, hypertension awareness, cholesterol awareness, chronic health conditions, alcohol consumption, fruit and vegetable consumption, arthritis burden, and seatbelt use.
The data was collected across 50 states from randomly selected households that had a landline. The participant from each household was whoever volunteered, provided they were 18 years of age or older.
The target population is the non-institutionalized adult population (18 years of age and older) residing in the US.
According to the data, BRFSS randomly selects households from the states. However, it’s up to a person in the household to volunteer for the survey. Therefore, I don’t believe the data can be generalized to the entire non-institutionalized adult population (18 and above) of the United States. For example, I don’t expect people aged 18 and above who have a chronic illness to be very interested in participating in a survey that takes as long as 20 minutes. So the results can be generalized only to the relatively healthy, non-institutionalized population of the United States aged 18 and above.
Again, there was no random assignment, so we can’t establish causality; our statements can only be correlational.
Research Question 1: How does the overall health of people vary across states, and is it related to the cost of visiting a doctor?
Research Question 2:
Is general health affected by the month in which the survey was conducted across the states, and how does health status depend on the employment status of an individual?
Research Question 3:
How do the BMI categories define the overall health of a person, and is overall health affected by whether a person exercises at least once a month?
Research Question 1: Analysis
# Research question 1, part 1: per state, the share of respondents who rate
# their general health as "Poor".

# Keep respondents with general health, medical-cost answer and state recorded.
brfss_research1 <- removeNA(brfss2013, c("genhlth", "medcost", "X_state"))
brfss_p_s <- select(brfss_research1, c("genhlth", "medcost", "X_state"))
brfss_p_s_1 <- select(brfss_research1, c("genhlth", "X_state"))

# Collapse general health into "Poor" versus everything else ("Good").
brfss_state_data <- group_by(brfss_p_s_1, X_state)
brfss_state_data1 <- brfss_state_data %>%
  mutate(genhlth1 = ifelse(genhlth != "Poor", "Good", "Poor"))
brfss_state_data1$genhlth <- NULL

# Respondent counts per state and health class, and per state overall.
brfss_state_data2 <- brfss_state_data1 %>%
  group_by(X_state, genhlth1) %>%
  summarise(freq = n())
brfss_state_data3 <- brfss_state_data2 %>%
  group_by(X_state) %>%
  summarize(total = sum(freq))

# Attach the state totals, turn counts into relative frequencies, then keep
# only the "Poor" rows ordered from the highest share to the lowest.
brfss_state_data4 <- merge.data.frame(
  x = brfss_state_data2,
  y = brfss_state_data3,
  by = "X_state"
)
brfss_state_data5 <- mutate(brfss_state_data4, rel_freq = freq / total)
brfss_state_data5 <- filter(brfss_state_data5, genhlth1 == "Poor")
brfss_state_data5 <- arrange(brfss_state_data5, desc(rel_freq))
# Research question 1, part 2: per state, the relative frequency of people who
# couldn't see a doctor because of the associated cost, compared with the
# relative frequency of "Poor" general health computed earlier in the script
# (brfss_state_data5).
brfss_m_s_1 <- brfss_p_s %>% select(c("medcost", "X_state"))

# Respondent counts per state and medcost answer, and per state overall.
brfss_state_data6 <- brfss_m_s_1 %>%
  group_by(X_state, medcost) %>%
  summarise(freq = n())
brfss_state_data7 <- brfss_state_data6 %>%
  group_by(X_state) %>%
  summarize(total = sum(freq))

# Keep the "Yes" rows and express them relative to the state total.
brfss_state_data6 <- brfss_state_data6 %>% filter(medcost == "Yes")
brfss_state_data8 <- merge.data.frame(
  x = brfss_state_data6,
  y = brfss_state_data7,
  by = "X_state"
)
brfss_state_data8 <- brfss_state_data8 %>% mutate(rel_freq = freq / total)

# Give both per-state tables distinct column names before joining them.
brfss_state_data5 <- rename(
  brfss_state_data5,
  poor_hlth_freq = freq,
  rel_poor_hlth_freq = rel_freq
)
brfss_state_data8 <- rename(
  brfss_state_data8,
  freq_no_money_for_doc = freq,
  rel_no_money_for_doc = rel_freq
)
brfss_state_data9 <- merge.data.frame(
  x = brfss_state_data5,
  y = brfss_state_data8,
  by = "X_state"
)

# State-level correlation between the two relative frequencies.
cor(brfss_state_data9$rel_poor_hlth_freq, brfss_state_data9$rel_no_money_for_doc)
## [1] 0.620436

# The axis labels promise percentages, while the stored columns are
# proportions in [0, 1]; scale by 100 in the aesthetics so the plotted values
# match the labels. The data frame and the correlation above are unaffected.
ggplot(
  brfss_state_data9,
  aes(x = rel_poor_hlth_freq * 100, y = rel_no_money_for_doc * 100)
) +
  geom_point(shape = 18, color = "blue") +
  geom_smooth(
    method = "lm", linetype = "dashed",
    color = "darkred", fill = "blue"
  ) +
  xlab("Percent of people across a state respond as having \"POOR\" health") +
  ylab("Percentage of people who have no Money to see a doc.")
Research Question 2: Analysis
# Research question 2, part 1: for every state and interview month, the
# percentage of respondents who rate their general health as "Poor".
brfss_2 <- removeNA(brfss2013, c("genhlth", "imonth", "X_state"))
brfss_2 <- brfss_2 %>% select(c("genhlth", "imonth", "X_state"))

# Respondents per state and month: everyone (brfss_5) and "Poor" only (brfss_4).
brfss_5 <- brfss_2 %>%
  group_by(X_state, imonth) %>%
  summarise(freq = n())
brfss_3 <- brfss_2 %>% filter(genhlth == "Poor")
brfss_4 <- brfss_3 %>%
  group_by(X_state, imonth) %>%
  summarise(freq = n())

# all.y = TRUE keeps the state-months in which nobody answered "Poor". A plain
# inner merge drops them, which removes every 0% observation and biases the
# monthly distributions upward. After the merge, freq.x is the "Poor" count
# and freq.y the total; a missing "Poor" count means zero.
brfss_6 <- merge.data.frame(
  x = brfss_4,
  y = brfss_5,
  by = c("X_state", "imonth"),
  all.y = TRUE
)
brfss_6 <- brfss_6 %>%
  mutate(freq.x = replace(freq.x, is.na(freq.x), 0L))

brfss_7 <- brfss_6 %>% mutate(rel_freq = freq.x / freq.y)
brfss_8 <- brfss_7 %>% select(c("rel_freq", "imonth", "X_state"))
brfss_8 <- brfss_8 %>% mutate(per_centrel_freq = rel_freq * 100)

# One box per month, each summarising the per-state percentages.
ggplot(data = brfss_8, aes(x = imonth, y = per_centrel_freq, fill = imonth)) +
  geom_boxplot() +
  xlab("Month in which data is collected") +
  ylab("Distribution of POOR Health frequency in 50 states") +
  scale_fill_discrete(name = "Month")
# Research question 2, part 2: employment-status mix within each general
# health level, drawn as bars normalised to proportions.
# Note: this reassigns brfss_5, replacing whatever it held before.
brfss_5 <- removeNA(brfss2013, c("genhlth", "employ1"))
ggplot(brfss_5, aes(x = genhlth, fill = employ1)) +
  geom_bar(position = "fill", colour = "black") +
  xlab("Overall Health Status") +
  ylab("Proportion of people employed with different employment status") +
  scale_fill_discrete(name = "Employment Status")
Research Question 3: Analysis
# Research question 3: general-health mix by exercise answer (exerany2), with
# one facet per BMI category (X_bmi5cat); bars are normalised to proportions.
brfss_research4 <- removeNA(brfss2013, c("genhlth", "exerany2", "X_bmi5cat"))
brfss_research4 <- select(
  brfss_research4,
  c("genhlth", "exerany2", "X_bmi5cat")
)
ggplot(brfss_research4, aes(x = exerany2, fill = genhlth)) +
  geom_bar(position = "fill", colour = "black") +
  facet_grid(. ~ X_bmi5cat) +
  xlab("BMI category per people who exerise atleast once a month") +
  ylab("Proportion") +
  scale_fill_discrete(name = "Overall Health Condition")
This plot provides a picture of the overall health status of people who exercise at least once a month and how it varies across the BMI categories.