Types | Examples |
---|---|
Integer: Natural numbers | Natural numbers 1, 2, 3 |
Numeric: Decimal values | 1.5, 2.2, 3.7 |
Logical: Boolean values | TRUE or FALSE (T or F) |
Character: Text or string values | “a” “cat” “blue” |
Dimension | Homogeneous | Heterogeneous |
---|---|---|
1d | vector | list |
2d | matrix | data frame |
nd | array |
Gender <- factor(c("male", "female", "female", "male"))
Gender
## [1] male female female male
## Levels: female male
mode(Gender)
## [1] "numeric"
str(Gender)
## Factor w/ 2 levels "female","male": 2 1 1 2
# if ggplot2 package is not installed, then install it now.
if (!require(ggplot2)) install.packages("ggplot2")
## Loading required package: ggplot2
library(ggplot2)
help(pressure)
str(pressure)
## 'data.frame': 19 obs. of 2 variables:
## $ temperature: num 0 20 40 60 80 100 120 140 160 180 ...
## $ pressure : num 0.0002 0.0012 0.006 0.03 0.09 0.27 0.75 1.85 4.2 8.8 ...
# plot using basic R plot function
plot(pressure$temperature, pressure$pressure, type="l", main="Basic R plot")
points(pressure$temperature, pressure$pressure)
## plot using ggplot
ggplot(pressure, aes(x=temperature, y=pressure)) +
geom_line() + geom_point() +
theme(text = element_text(size=20)) +
ggtitle("ggplot") +
theme(plot.title = element_text(hjust=0.5))
?faithful
str(faithful)
## 'data.frame': 272 obs. of 2 variables:
## $ eruptions: num 3.6 1.8 3.33 2.28 4.53 ...
## $ waiting : num 79 54 74 62 85 55 88 85 51 85 ...
# Basic R plot
hist(faithful$waiting)
#ggplot
ggplot(faithful, aes(x=waiting)) + geom_histogram() +
theme(text = element_text(size=20))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
str(faithful)
## 'data.frame': 272 obs. of 2 variables:
## $ eruptions: num 3.6 1.8 3.33 2.28 4.53 ...
## $ waiting : num 79 54 74 62 85 55 88 85 51 85 ...
?ToothGrowth
str(ToothGrowth)
## 'data.frame': 60 obs. of 3 variables:
## $ len : num 4.2 11.5 7.3 5.8 6.4 10 11.2 11.2 5.2 7 ...
## $ supp: Factor w/ 2 levels "OJ","VC": 2 2 2 2 2 2 2 2 2 2 ...
## $ dose: num 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
# Basic R plot
plot(ToothGrowth$supp, ToothGrowth$len)
# Or
boxplot(len ~ supp, data = ToothGrowth)
# ggplot
ggplot(ToothGrowth, aes(x=supp, y=len)) + geom_boxplot()
#Basic plot
boxplot(len ~ supp + dose, data = ToothGrowth)
# ggplot
ggplot(ToothGrowth, aes(x=interaction(supp, dose), y=len)) + geom_boxplot() +
theme(text = element_text(size=20))
if (!require(datarium)) install.packages("datarium")
## Loading required package: datarium
library(datarium)
?marketing
str(marketing)
## 'data.frame': 200 obs. of 4 variables:
## $ youtube : num 276.1 53.4 20.6 181.8 217 ...
## $ facebook : num 45.4 47.2 55.1 49.6 13 ...
## $ newspaper: num 83 54.1 83.2 70.2 70.1 ...
## $ sales : num 26.5 12.5 11.2 22.2 15.5 ...
# Youtube
ggplot(marketing, aes(x = youtube, y = sales)) + geom_point()
#Facebook
ggplot(marketing, aes(x = facebook, y = sales)) + geom_point()
model <- lm(sales ~ youtube, data = marketing)
model
##
## Call:
## lm(formula = sales ~ youtube, data = marketing)
##
## Coefficients:
## (Intercept) youtube
## 8.43911 0.04754
ggplot(marketing, aes(youtube, sales)) + geom_point() +
stat_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'
summary(model)
##
## Call:
## lm(formula = sales ~ youtube, data = marketing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.0632 -2.3454 -0.2295 2.4805 8.6548
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.439112 0.549412 15.36 <2e-16 ***
## youtube 0.047537 0.002691 17.67 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.91 on 198 degrees of freedom
## Multiple R-squared: 0.6119, Adjusted R-squared: 0.6099
## F-statistic: 312.1 on 1 and 198 DF, p-value: < 2.2e-16
model <- lm(sales ~ youtube + facebook + newspaper, data = marketing)
model
##
## Call:
## lm(formula = sales ~ youtube + facebook + newspaper, data = marketing)
##
## Coefficients:
## (Intercept) youtube facebook newspaper
## 3.526667 0.045765 0.188530 -0.001037
summary(model)
##
## Call:
## lm(formula = sales ~ youtube + facebook + newspaper, data = marketing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.5932 -1.0690 0.2902 1.4272 3.3951
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.526667 0.374290 9.422 <2e-16 ***
## youtube 0.045765 0.001395 32.809 <2e-16 ***
## facebook 0.188530 0.008611 21.893 <2e-16 ***
## newspaper -0.001037 0.005871 -0.177 0.86
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.023 on 196 degrees of freedom
## Multiple R-squared: 0.8972, Adjusted R-squared: 0.8956
## F-statistic: 570.3 on 3 and 196 DF, p-value: < 2.2e-16
insurance <- read.csv("insurance.csv")
str(insurance)
## 'data.frame': 1338 obs. of 7 variables:
## $ age : int 19 18 28 33 32 31 46 37 37 60 ...
## $ gender : chr "female" "male" "male" "male" ...
## $ bmi : num 27.9 33.8 33 22.7 28.9 ...
## $ children: int 0 1 3 0 0 0 1 3 2 0 ...
## $ smoker : chr "yes" "no" "no" "no" ...
## $ region : chr "southwest" "southeast" "southeast" "northwest" ...
## $ charges : num 16885 1726 4449 21984 3867 ...
summary(insurance$charges)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1122 4740 9382 13270 16640 63770
Here, mean is larger than median.
Because the mean value is greater than the median, this implies that the distribution of insurance charges is right-skewed
#Basic plot
hist(insurance$charges)
#ggplot
ggplot(insurance, aes(charges)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
table(insurance$region)
##
## northeast northwest southeast southwest
## 324 325 364 325
table(insurance$gender)
##
## female male
## 662 676
table(insurance$smoker)
##
## no yes
## 1064 274
To create a correlation matrix for the four numeric variables in the insurance data frame, use the cor() command:
cor(insurance[c("age", "bmi", "children", "charges")])
## age bmi children charges
## age 1.0000000 0.1092719 0.04246900 0.29900819
## bmi 0.1092719 1.0000000 0.01275890 0.19834097
## children 0.0424690 0.0127589 1.00000000 0.06799823
## charges 0.2990082 0.1983410 0.06799823 1.00000000
pairs(insurance[c("age", "bmi", "children", "charges")])
An enhanced scatterplot matrix can be created with the ** pairs.panels() function in the psych package.
if(!require(psych)) install.packages("psych")
## Loading required package: psych
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(psych)
pairs.panels(insurance[c("age", "bmi", "children", "charges")])
ins_model <- lm(charges ~ age + children + bmi + gender + smoker
+ region, data = insurance)
ins_model
##
## Call:
## lm(formula = charges ~ age + children + bmi + gender + smoker +
## region, data = insurance)
##
## Coefficients:
## (Intercept) age children bmi
## -11938.5 256.9 475.5 339.2
## gendermale smokeryes regionnorthwest regionsoutheast
## -131.3 23848.5 -353.0 -1035.0
## regionsouthwest
## -960.1
summary(ins_model)
##
## Call:
## lm(formula = charges ~ age + children + bmi + gender + smoker +
## region, data = insurance)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11304.9 -2848.1 -982.1 1393.9 29992.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11938.5 987.8 -12.086 < 2e-16 ***
## age 256.9 11.9 21.587 < 2e-16 ***
## children 475.5 137.8 3.451 0.000577 ***
## bmi 339.2 28.6 11.860 < 2e-16 ***
## gendermale -131.3 332.9 -0.394 0.693348
## smokeryes 23848.5 413.1 57.723 < 2e-16 ***
## regionnorthwest -353.0 476.3 -0.741 0.458769
## regionsoutheast -1035.0 478.7 -2.162 0.030782 *
## regionsouthwest -960.0 477.9 -2.009 0.044765 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6062 on 1329 degrees of freedom
## Multiple R-squared: 0.7509, Adjusted R-squared: 0.7494
## F-statistic: 500.8 on 8 and 1329 DF, p-value: < 2.2e-16
insurance$age2 <- insurance$age^2
Suppose we have a hunch that the effect of a feature is not cumulative, but rather it has an effect only once a specific threshold has been reached.
For instance, BMI may have zero impact on medical expenditures for individuals in the normal weight range, but it may be strongly related to higher costs for the obese (that is, BMI of 30 or above).
#For BMI greater than or equal to 30, we will return 1,otherwise 0:
insurance$bmi30 <- ifelse(insurance$bmi >= 30, 1, 0)
charges ~ bmi30*smoker
## charges ~ bmi30 * smoker
# the * operator is shorthand that instructs R to model
charges ~ bmi30 + smokeryes + bmi30:smokeryes
## charges ~ bmi30 + smokeryes + bmi30:smokeryes
The : (colon) operator in the expanded form indicates that bmi30:smokeryes is the interaction between the two variables.
ins_model2 <- lm(charges ~ age + age2 + children + bmi + gender
+ bmi30*smoker + region, data = insurance)
ins_model2
##
## Call:
## lm(formula = charges ~ age + age2 + children + bmi + gender +
## bmi30 * smoker + region, data = insurance)
##
## Coefficients:
## (Intercept) age age2 children
## 134.251 -32.685 3.732 678.561
## bmi gendermale bmi30 smokeryes
## 120.020 -496.824 -1000.140 13404.687
## regionnorthwest regionsoutheast regionsouthwest bmi30:smokeryes
## -279.204 -828.547 -1222.644 19810.753
summary(ins_model2)
##
## Call:
## lm(formula = charges ~ age + age2 + children + bmi + gender +
## bmi30 * smoker + region, data = insurance)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17296.4 -1656.0 -1263.3 -722.1 24160.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 134.2509 1362.7511 0.099 0.921539
## age -32.6851 59.8242 -0.546 0.584915
## age2 3.7316 0.7463 5.000 6.50e-07 ***
## children 678.5612 105.8831 6.409 2.04e-10 ***
## bmi 120.0196 34.2660 3.503 0.000476 ***
## gendermale -496.8245 244.3659 -2.033 0.042240 *
## bmi30 -1000.1403 422.8402 -2.365 0.018159 *
## smokeryes 13404.6866 439.9491 30.469 < 2e-16 ***
## regionnorthwest -279.2038 349.2746 -0.799 0.424212
## regionsoutheast -828.5467 351.6352 -2.356 0.018604 *
## regionsouthwest -1222.6437 350.5285 -3.488 0.000503 ***
## bmi30:smokeryes 19810.7533 604.6567 32.764 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4445 on 1326 degrees of freedom
## Multiple R-squared: 0.8664, Adjusted R-squared: 0.8653
## F-statistic: 781.7 on 11 and 1326 DF, p-value: < 2.2e-16
perf_ins_model <- summary(ins_model)
perf_ins_model2 <- summary(ins_model2)
#Obtain the attribute list of the object perf_ins_model
attributes(perf_ins_model)
## $names
## [1] "call" "terms" "residuals" "coefficients"
## [5] "aliased" "sigma" "df" "r.squared"
## [9] "adj.r.squared" "fstatistic" "cov.unscaled"
##
## $class
## [1] "summary.lm"
#Extract R square and adjust R square values of the two models and save them into a data frame.
model_comp <- data.frame(r.square=perf_ins_model$r.squared, adj.r.squared=perf_ins_model$adj.r.squared)
model_comp <- rbind(model_comp, c(r.square=perf_ins_model2$r.squared, adj.r.squared=perf_ins_model2$adj.r.squared ))
rownames(model_comp) = c("model1", "model2")
if(!require(DT)) install.packages("DT")
## Loading required package: DT
library(DT)
datatable(round(model_comp, 3))