This report is automatically generated with the R
package knitr
(version 1.5
)
.
# Chapter 15 Testing Differences and Relations Taking a Closer Look at Distributions # Observing beavers str(beaver2)
## 'data.frame': 100 obs. of 4 variables: ## $ day : num 307 307 307 307 307 307 307 307 307 307 ... ## $ time : num 930 940 950 1000 1010 1020 1030 1040 1050 1100 ... ## $ temp : num 36.6 36.7 36.9 37.1 37.2 ... ## $ activ: num 0 0 0 0 0 0 0 0 0 0 ...
## Testing normality graphically library(lattice) histogram(~temp | factor(activ), data = beaver2)
## Using quantile plots Comparing two samples qqplot(beaver2$temp[beaver2$activ == 1], beaver2$temp[beaver2$activ == 0])
### Using a QQ plot to check for normality qqnorm(beaver2$temp[beaver2$activ == 0], main = "Inactive") qqline(beaver2$temp[beaver2$activ == 0])
## Testing normality in a formal way shapiro.test(beaver2$temp)
## ## Shapiro-Wilk normality test ## ## data: beaver2$temp ## W = 0.9334, p-value = 7.764e-05
result <- shapiro.test(beaver2$temp) result$p.value
## [1] 7.764e-05
with(beaver2, tapply(temp, activ, shapiro.test))
## $`0` ## ## Shapiro-Wilk normality test ## ## data: X[[1L]] ## W = 0.9543, p-value = 0.1231 ## ## ## $`1` ## ## Shapiro-Wilk normality test ## ## data: X[[2L]] ## W = 0.9833, p-value = 0.5583
# Comparing Two Samples Testing differences Carrying out a t-test t.test(temp ~ activ, data = beaver2)
## ## Welch Two Sample t-test ## ## data: temp by activ ## t = -18.55, df = 80.85, p-value < 2.2e-16 ## alternative hypothesis: true difference in means is not equal to 0 ## 95 percent confidence interval: ## -0.8927 -0.7197 ## sample estimates: ## mean in group 0 mean in group 1 ## 37.1 37.9
activetemp <- beaver2$temp[beaver2$activ == 1] inactivetemp <- beaver2$temp[beaver2$activ == 0] t.test(activetemp, inactivetemp)
## ## Welch Two Sample t-test ## ## data: activetemp and inactivetemp ## t = 18.55, df = 80.85, p-value < 2.2e-16 ## alternative hypothesis: true difference in means is not equal to 0 ## 95 percent confidence interval: ## 0.7197 0.8927 ## sample estimates: ## mean of x mean of y ## 37.9 37.1
### Dropping assumptions wilcox.test(temp ~ activ, data = beaver2)
## ## Wilcoxon rank sum test with continuity correction ## ## data: temp by activ ## W = 15, p-value < 2.2e-16 ## alternative hypothesis: true location shift is not equal to 0
### Testing direction Comparing paired data t.test(extra ~ group, data = sleep, paired = TRUE)
## ## Paired t-test ## ## data: extra by group ## t = -4.062, df = 9, p-value = 0.002833 ## alternative hypothesis: true difference in means is not equal to 0 ## 95 percent confidence interval: ## -2.4599 -0.7001 ## sample estimates: ## mean of the differences ## -1.58
# Testing Counts and Proportions Checking out proportions survivors <- matrix(c(1781, 1443, 135, 47), ncol = 2) colnames(survivors) <- c("survived", "died") rownames(survivors) <- c("no seat belt", "seat belt") survivors
## survived died ## no seat belt 1781 135 ## seat belt 1443 47
result.prop <- prop.test(survivors) result.prop
## ## 2-sample test for equality of proportions with continuity correction ## ## data: survivors ## X-squared = 24.33, df = 1, p-value = 8.105e-07 ## alternative hypothesis: two.sided ## 95 percent confidence interval: ## -0.05401 -0.02383 ## sample estimates: ## prop 1 prop 2 ## 0.9295 0.9685
## Analyzing tables Testing contingency of tables chisq.test(survivors)
## ## Pearson's Chi-squared test with Yates' continuity correction ## ## data: survivors ## X-squared = 24.33, df = 1, p-value = 8.105e-07
### Testing tables with more than two columns str(HairEyeColor)
## table [1:4, 1:4, 1:2] 32 53 10 3 11 50 10 30 10 25 ... ## - attr(*, "dimnames")=List of 3 ## ..$ Hair: chr [1:4] "Black" "Brown" "Red" "Blond" ## ..$ Eye : chr [1:4] "Brown" "Blue" "Hazel" "Green" ## ..$ Sex : chr [1:2] "Male" "Female"
HairEyeMargin <- margin.table(HairEyeColor, margin = c(1, 2)) HairEyeMargin
## Eye ## Hair Brown Blue Hazel Green ## Black 68 20 15 5 ## Brown 119 84 54 29 ## Red 26 17 14 14 ## Blond 7 94 10 16
chisq.test(HairEyeMargin)
## ## Pearson's Chi-squared test ## ## data: HairEyeMargin ## X-squared = 138.3, df = 9, p-value < 2.2e-16
## Extracting test results str(result)
## List of 4 ## $ statistic: Named num 0.933 ## ..- attr(*, "names")= chr "W" ## $ p.value : num 7.76e-05 ## $ method : chr "Shapiro-Wilk normality test" ## $ data.name: chr "beaver2$temp" ## - attr(*, "class")= chr "htest"
t.test(temp ~ activ, data = beaver2)$p.value
## [1] 7.269e-31
# Working with Models Analyzing variances str(InsectSprays)
## 'data.frame': 72 obs. of 2 variables: ## $ count: num 10 7 20 14 14 12 10 23 17 20 ... ## $ spray: Factor w/ 6 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ...
### Building the model AOVModel <- aov(count ~ spray, data = InsectSprays) ### Looking at the object AOVModel
## Call: ## aov(formula = count ~ spray, data = InsectSprays) ## ## Terms: ## spray Residuals ## Sum of Squares 2669 1015 ## Deg. of Freedom 5 66 ## ## Residual standard error: 3.922 ## Estimated effects may be unbalanced
## Evaluating the differences summary(AOVModel)
## Df Sum Sq Mean Sq F value Pr(>F) ## spray 5 2669 534 34.7 <2e-16 *** ## Residuals 66 1015 15 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
### Checking the model tables model.tables(AOVModel, type = "effects")
## Tables of effects ## ## spray ## spray ## A B C D E F ## 5.000 5.833 -7.417 -4.583 -6.000 7.167
### Looking at the individual differences Comparisons <- TukeyHSD(AOVModel) Comparisons$spray["D-C", ]
## diff lwr upr p adj ## 2.8333 -1.8661 7.5327 0.4921
### Plotting the differences plot(Comparisons, las = 1)
## Modeling linear relations Building a linear model Model <- lm(mpg ~ wt, data = mtcars) ### Extracting information from the model coef.Model <- coef(Model) coef.Model
## (Intercept) wt ## 37.285 -5.344
plot(mpg ~ wt, data = mtcars) abline(a = coef.Model[1], b = coef.Model[2])
## Evaluating linear models Summarizing the model Model.summary <- summary(Model) Model.summary
## ## Call: ## lm(formula = mpg ~ wt, data = mtcars) ## ## Residuals: ## Min 1Q Median 3Q Max ## -4.543 -2.365 -0.125 1.410 6.873 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 37.285 1.878 19.86 < 2e-16 *** ## wt -5.344 0.559 -9.56 1.3e-10 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 3.05 on 30 degrees of freedom ## Multiple R-squared: 0.753, Adjusted R-squared: 0.745 ## F-statistic: 91.4 on 1 and 30 DF, p-value: 1.29e-10
coef(Model.summary)
## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 37.285 1.8776 19.858 8.242e-19 ## wt -5.344 0.5591 -9.559 1.294e-10
### Testing the impact of model terms Model.anova <- anova(Model) Model.anova
## Analysis of Variance Table ## ## Response: mpg ## Df Sum Sq Mean Sq F value Pr(>F) ## wt 1 848 848 91.4 1.3e-10 *** ## Residuals 30 278 9 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Model.anova["wt", "Pr(>F)"]
## [1] 1.294e-10
## Predicting new values Getting the values new.cars <- data.frame(wt = c(1.7, 2.4, 3.6)) predict(Model, newdata = new.cars)
## 1 2 3 ## 28.20 24.46 18.05
### Having confidence in your predictions predict(Model, newdata = new.cars, interval = "confidence")
## fit lwr upr ## 1 28.20 26.15 30.25 ## 2 24.46 23.02 25.90 ## 3 18.05 16.86 19.23
predict(Model, newdata = new.cars, interval = "prediction")
## fit lwr upr ## 1 28.20 21.65 34.75 ## 2 24.46 18.07 30.84 ## 3 18.05 11.71 24.38
The R session information (including the OS info, R version and all packages used):
sessionInfo()
## R version 3.0.2 (2013-09-25) ## Platform: x86_64-w64-mingw32/x64 (64-bit) ## ## locale: ## [1] LC_COLLATE=English_United Kingdom.1252 LC_CTYPE=English_United Kingdom.1252 ## [3] LC_MONETARY=English_United Kingdom.1252 LC_NUMERIC=C ## [5] LC_TIME=English_United Kingdom.1252 ## ## attached base packages: ## [1] stats graphics grDevices utils datasets methods base ## ## other attached packages: ## [1] BiocInstaller_1.12.1 ggplot2_0.9.3.1 reshape2_1.2.2 sos_1.3-8 ## [5] brew_1.0-6 stringr_0.6.2 knitr_1.5 plyr_1.8 ## [9] Revobase_7.1.0 RevoMods_7.1.0 RevoScaleR_7.1.0 lattice_0.20-27 ## [13] rpart_4.1-2 ## ## loaded via a namespace (and not attached): ## [1] codetools_0.2-8 colorspace_1.2-4 dichromat_2.0-0 digest_0.6.4 ## [5] evaluate_0.5.1 foreach_1.4.1 formatR_0.10 fortunes_1.5-2 ## [9] grid_3.0.2 gtable_0.1.2 highr_0.3 iterators_1.0.6 ## [13] labeling_0.2 MASS_7.3-29 munsell_0.4.2 proto_0.3-10 ## [17] RColorBrewer_1.0-5 scales_0.2.3 tools_3.0.2
Sys.time()
## [1] "2014-05-13 15:06:16 BST"