R Markdown

R data types

Types Examples
Integer: Natural numbers Natural numbers 1, 2, 3
Numeric: Decimal values 1.5, 2.2, 3.7
Logical: Boolean values TRUE or FALSE (T or F)
Character: Text or string values “a” “cat” “blue”

Data Structures

Dimension Homogeneous Heterogeneous
1d vector list
2d matrix data frame
nd array

Factor

Gender <- factor(c("male", "female", "female", "male"))
Gender
## [1] male   female female male  
## Levels: female male
mode(Gender)
## [1] "numeric"
str(Gender)
##  Factor w/ 2 levels "female","male": 2 1 1 2

Advanced graphics: ggplot2

# if  ggplot2 package is not installed, then install it now.
if (!require(ggplot2)) install.packages("ggplot2")
## Loading required package: ggplot2
library(ggplot2)

Line graph

help(pressure)
str(pressure)
## 'data.frame':    19 obs. of  2 variables:
##  $ temperature: num  0 20 40 60 80 100 120 140 160 180 ...
##  $ pressure   : num  0.0002 0.0012 0.006 0.03 0.09 0.27 0.75 1.85 4.2 8.8 ...
Question: How to show the presssure change versus temperature?
# plot using basic R plot function
plot(pressure$temperature, pressure$pressure, type="l", main="Basic R plot")
points(pressure$temperature, pressure$pressure)

## plot using ggplot
ggplot(pressure, aes(x=temperature, y=pressure)) +
  geom_line() + geom_point() + 
  theme(text = element_text(size=20)) +
  ggtitle("ggplot") + 
  theme(plot.title = element_text(hjust=0.5))

Histogram

?faithful

str(faithful)
## 'data.frame':    272 obs. of  2 variables:
##  $ eruptions: num  3.6 1.8 3.33 2.28 4.53 ...
##  $ waiting  : num  79 54 74 62 85 55 88 85 51 85 ...
Question: Plot distribution of waiting time
# Basic R plot
hist(faithful$waiting)

#ggplot
ggplot(faithful, aes(x=waiting)) + geom_histogram() + 
  theme(text = element_text(size=20))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

str(faithful)
## 'data.frame':    272 obs. of  2 variables:
##  $ eruptions: num  3.6 1.8 3.33 2.28 4.53 ...
##  $ waiting  : num  79 54 74 62 85 55 88 85 51 85 ...

Boxplot

?ToothGrowth
str(ToothGrowth)
## 'data.frame':    60 obs. of  3 variables:
##  $ len : num  4.2 11.5 7.3 5.8 6.4 10 11.2 11.2 5.2 7 ...
##  $ supp: Factor w/ 2 levels "OJ","VC": 2 2 2 2 2 2 2 2 2 2 ...
##  $ dose: num  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
Question: How to compare tooth growth of pigs under different conditions?
#  Basic R plot
plot(ToothGrowth$supp, ToothGrowth$len) 

# Or 
boxplot(len ~ supp, data = ToothGrowth)

# ggplot
ggplot(ToothGrowth, aes(x=supp, y=len)) + geom_boxplot()

Question: Is this tooth growth pattern for the two dlievery methods same for all the three dose levels?
#Basic plot
boxplot(len ~ supp + dose, data = ToothGrowth)

# ggplot
ggplot(ToothGrowth, aes(x=interaction(supp, dose), y=len)) + geom_boxplot() + 
  theme(text = element_text(size=20))

Marketing Data

if (!require(datarium)) install.packages("datarium")
## Loading required package: datarium
library(datarium)
?marketing
str(marketing)
## 'data.frame':    200 obs. of  4 variables:
##  $ youtube  : num  276.1 53.4 20.6 181.8 217 ...
##  $ facebook : num  45.4 47.2 55.1 49.6 13 ...
##  $ newspaper: num  83 54.1 83.2 70.2 70.1 ...
##  $ sales    : num  26.5 12.5 11.2 22.2 15.5 ...
Question: Can we predict sale using advertising budget?

Visualization

# Youtube
ggplot(marketing, aes(x = youtube, y = sales)) + geom_point() 

#Facebook
ggplot(marketing, aes(x = facebook, y = sales)) + geom_point() 

A simple linear regression model

model <- lm(sales ~ youtube, data = marketing)
model
## 
## Call:
## lm(formula = sales ~ youtube, data = marketing)
## 
## Coefficients:
## (Intercept)      youtube  
##     8.43911      0.04754

Plot linear regressin line

ggplot(marketing, aes(youtube, sales)) + geom_point() +
stat_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

View the model

summary(model)
## 
## Call:
## lm(formula = sales ~ youtube, data = marketing)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.0632  -2.3454  -0.2295   2.4805   8.6548 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 8.439112   0.549412   15.36   <2e-16 ***
## youtube     0.047537   0.002691   17.67   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.91 on 198 degrees of freedom
## Multiple R-squared:  0.6119, Adjusted R-squared:  0.6099 
## F-statistic: 312.1 on 1 and 198 DF,  p-value: < 2.2e-16

A Multiple linear regression model

model <- lm(sales ~ youtube + facebook + newspaper, data = marketing)
model
## 
## Call:
## lm(formula = sales ~ youtube + facebook + newspaper, data = marketing)
## 
## Coefficients:
## (Intercept)      youtube     facebook    newspaper  
##    3.526667     0.045765     0.188530    -0.001037
summary(model)
## 
## Call:
## lm(formula = sales ~ youtube + facebook + newspaper, data = marketing)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.5932  -1.0690   0.2902   1.4272   3.3951 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.526667   0.374290   9.422   <2e-16 ***
## youtube      0.045765   0.001395  32.809   <2e-16 ***
## facebook     0.188530   0.008611  21.893   <2e-16 ***
## newspaper   -0.001037   0.005871  -0.177     0.86    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.023 on 196 degrees of freedom
## Multiple R-squared:  0.8972, Adjusted R-squared:  0.8956 
## F-statistic: 570.3 on 3 and 196 DF,  p-value: < 2.2e-16

Simulated medical expense data

insurance <- read.csv("insurance.csv")
str(insurance)
## 'data.frame':    1338 obs. of  7 variables:
##  $ age     : int  19 18 28 33 32 31 46 37 37 60 ...
##  $ gender  : chr  "female" "male" "male" "male" ...
##  $ bmi     : num  27.9 33.8 33 22.7 28.9 ...
##  $ children: int  0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker  : chr  "yes" "no" "no" "no" ...
##  $ region  : chr  "southwest" "southeast" "southeast" "northwest" ...
##  $ charges : num  16885 1726 4449 21984 3867 ...

Explore data

summary(insurance$charges)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1122    4740    9382   13270   16640   63770

Here, mean is larger than median.

Question: What kind of distribution of insurance charge you expect?

Because the mean value is greater than the median, this implies that the distribution of insurance charges is right-skewed

#Basic plot
hist(insurance$charges)

#ggplot
ggplot(insurance, aes(charges)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Categorical variables

table(insurance$region)
## 
## northeast northwest southeast southwest 
##       324       325       364       325
table(insurance$gender)
## 
## female   male 
##    662    676
table(insurance$smoker)
## 
##   no  yes 
## 1064  274

Correlation matrix

To create a correlation matrix for the four numeric variables in the insurance data frame, use the cor() command:

cor(insurance[c("age", "bmi", "children", "charges")])
##                age       bmi   children    charges
## age      1.0000000 0.1092719 0.04246900 0.29900819
## bmi      0.1092719 1.0000000 0.01275890 0.19834097
## children 0.0424690 0.0127589 1.00000000 0.06799823
## charges  0.2990082 0.1983410 0.06799823 1.00000000

Visualization relationships among features - the scatterplot matrix

pairs(insurance[c("age", "bmi", "children", "charges")])

Enhanced scatterplot matrix

An enhanced scatterplot matrix can be created with the ** pairs.panels() function in the psych package.

if(!require(psych)) install.packages("psych")
## Loading required package: psych
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(psych)
pairs.panels(insurance[c("age", "bmi", "children", "charges")])

Creat a linear regression model for predicting medical expense

ins_model <- lm(charges ~ age + children + bmi + gender + smoker
    + region, data = insurance)
ins_model
## 
## Call:
## lm(formula = charges ~ age + children + bmi + gender + smoker + 
##     region, data = insurance)
## 
## Coefficients:
##     (Intercept)              age         children              bmi  
##        -11938.5            256.9            475.5            339.2  
##      gendermale        smokeryes  regionnorthwest  regionsoutheast  
##          -131.3          23848.5           -353.0          -1035.0  
## regionsouthwest  
##          -960.1

Model performance

summary(ins_model)
## 
## Call:
## lm(formula = charges ~ age + children + bmi + gender + smoker + 
##     region, data = insurance)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11304.9  -2848.1   -982.1   1393.9  29992.8 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -11938.5      987.8 -12.086  < 2e-16 ***
## age                256.9       11.9  21.587  < 2e-16 ***
## children           475.5      137.8   3.451 0.000577 ***
## bmi                339.2       28.6  11.860  < 2e-16 ***
## gendermale        -131.3      332.9  -0.394 0.693348    
## smokeryes        23848.5      413.1  57.723  < 2e-16 ***
## regionnorthwest   -353.0      476.3  -0.741 0.458769    
## regionsoutheast  -1035.0      478.7  -2.162 0.030782 *  
## regionsouthwest   -960.0      477.9  -2.009 0.044765 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6062 on 1329 degrees of freedom
## Multiple R-squared:  0.7509, Adjusted R-squared:  0.7494 
## F-statistic: 500.8 on 8 and 1329 DF,  p-value: < 2.2e-16

Improve prediction model: non-linear variable

insurance$age2 <- insurance$age^2

Improve prediction model: converting a numeric variable to a binary indicator

#For BMI greater than or equal to 30, we will return 1,otherwise 0:
insurance$bmi30 <- ifelse(insurance$bmi >= 30, 1, 0)

Imporve model: adding interaction effects

charges ~ bmi30*smoker
## charges ~ bmi30 * smoker
# the * operator is shorthand that instructs R to model 
charges ~ bmi30 + smokeryes + bmi30:smokeryes
## charges ~ bmi30 + smokeryes + bmi30:smokeryes

The : (colon) operator in the expanded form indicates that bmi30:smokeryes is the interaction between the two variables.

Build an improved prediction model

ins_model2 <- lm(charges ~ age + age2 + children + bmi + gender
     + bmi30*smoker + region, data = insurance)
ins_model2
## 
## Call:
## lm(formula = charges ~ age + age2 + children + bmi + gender + 
##     bmi30 * smoker + region, data = insurance)
## 
## Coefficients:
##     (Intercept)              age             age2         children  
##         134.251          -32.685            3.732          678.561  
##             bmi       gendermale            bmi30        smokeryes  
##         120.020         -496.824        -1000.140        13404.687  
## regionnorthwest  regionsoutheast  regionsouthwest  bmi30:smokeryes  
##        -279.204         -828.547        -1222.644        19810.753

Model performance

summary(ins_model2)
## 
## Call:
## lm(formula = charges ~ age + age2 + children + bmi + gender + 
##     bmi30 * smoker + region, data = insurance)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17296.4  -1656.0  -1263.3   -722.1  24160.2 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       134.2509  1362.7511   0.099 0.921539    
## age               -32.6851    59.8242  -0.546 0.584915    
## age2                3.7316     0.7463   5.000 6.50e-07 ***
## children          678.5612   105.8831   6.409 2.04e-10 ***
## bmi               120.0196    34.2660   3.503 0.000476 ***
## gendermale       -496.8245   244.3659  -2.033 0.042240 *  
## bmi30           -1000.1403   422.8402  -2.365 0.018159 *  
## smokeryes       13404.6866   439.9491  30.469  < 2e-16 ***
## regionnorthwest  -279.2038   349.2746  -0.799 0.424212    
## regionsoutheast  -828.5467   351.6352  -2.356 0.018604 *  
## regionsouthwest -1222.6437   350.5285  -3.488 0.000503 ***
## bmi30:smokeryes 19810.7533   604.6567  32.764  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4445 on 1326 degrees of freedom
## Multiple R-squared:  0.8664, Adjusted R-squared:  0.8653 
## F-statistic: 781.7 on 11 and 1326 DF,  p-value: < 2.2e-16

Model comparison

perf_ins_model <- summary(ins_model)
perf_ins_model2 <- summary(ins_model2)

#Obtain the attribute list of the object perf_ins_model
attributes(perf_ins_model)
## $names
##  [1] "call"          "terms"         "residuals"     "coefficients" 
##  [5] "aliased"       "sigma"         "df"            "r.squared"    
##  [9] "adj.r.squared" "fstatistic"    "cov.unscaled" 
## 
## $class
## [1] "summary.lm"
#Extract R square and adjust R square values of the two models and save them into a data frame.
model_comp <- data.frame(r.square=perf_ins_model$r.squared, adj.r.squared=perf_ins_model$adj.r.squared)

model_comp <- rbind(model_comp, c(r.square=perf_ins_model2$r.squared, adj.r.squared=perf_ins_model2$adj.r.squared ))
rownames(model_comp) = c("model1", "model2")

if(!require(DT)) install.packages("DT")
## Loading required package: DT
library(DT)
datatable(round(model_comp, 3))

Reference

  • R Graphics Cookbook by Winston Wang
  • Using R for Data Analysis and Graphics by J H Maindonald The Art of R Programming by Norman Matolff
  • Machine Learning with R by BrettLantz