This report is automatically generated with the R package knitr (version 1.5) .

# Chapter 15 Testing Differences and Relations Taking a Closer Look at Distributions
# Observing beavers
str(beaver2)
## 'data.frame':	100 obs. of  4 variables:
##  $ day  : num  307 307 307 307 307 307 307 307 307 307 ...
##  $ time : num  930 940 950 1000 1010 1020 1030 1040 1050 1100 ...
##  $ temp : num  36.6 36.7 36.9 37.1 37.2 ...
##  $ activ: num  0 0 0 0 0 0 0 0 0 0 ...
## Testing normality graphically
library(lattice)
histogram(~temp | factor(activ), data = beaver2)
plot of chunk unnamed-chunk-1
## Using quantile plots Comparing two samples
qqplot(beaver2$temp[beaver2$activ == 1], beaver2$temp[beaver2$activ == 0])
plot of chunk unnamed-chunk-1
### Using a QQ plot to check for normality
qqnorm(beaver2$temp[beaver2$activ == 0], main = "Inactive")
qqline(beaver2$temp[beaver2$activ == 0])
plot of chunk unnamed-chunk-1
## Testing normality in a formal way
shapiro.test(beaver2$temp)
## 
## 	Shapiro-Wilk normality test
## 
## data:  beaver2$temp
## W = 0.9334, p-value = 7.764e-05
result <- shapiro.test(beaver2$temp)
result$p.value
## [1] 7.764e-05
with(beaver2, tapply(temp, activ, shapiro.test))
## $`0`
## 
## 	Shapiro-Wilk normality test
## 
## data:  X[[1L]]
## W = 0.9543, p-value = 0.1231
## 
## 
## $`1`
## 
## 	Shapiro-Wilk normality test
## 
## data:  X[[2L]]
## W = 0.9833, p-value = 0.5583
# Comparing Two Samples Testing differences Carrying out a t-test
t.test(temp ~ activ, data = beaver2)
## 
## 	Welch Two Sample t-test
## 
## data:  temp by activ
## t = -18.55, df = 80.85, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.8927 -0.7197
## sample estimates:
## mean in group 0 mean in group 1 
##            37.1            37.9
activetemp <- beaver2$temp[beaver2$activ == 1]
inactivetemp <- beaver2$temp[beaver2$activ == 0]
t.test(activetemp, inactivetemp)
## 
## 	Welch Two Sample t-test
## 
## data:  activetemp and inactivetemp
## t = 18.55, df = 80.85, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.7197 0.8927
## sample estimates:
## mean of x mean of y 
##      37.9      37.1
### Dropping assumptions
wilcox.test(temp ~ activ, data = beaver2)
## 
## 	Wilcoxon rank sum test with continuity correction
## 
## data:  temp by activ
## W = 15, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
### Testing direction Comparing paired data
t.test(extra ~ group, data = sleep, paired = TRUE)
## 
## 	Paired t-test
## 
## data:  extra by group
## t = -4.062, df = 9, p-value = 0.002833
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.4599 -0.7001
## sample estimates:
## mean of the differences 
##                   -1.58
# Testing Counts and Proportions Checking out proportions
survivors <- matrix(c(1781, 1443, 135, 47), ncol = 2)
colnames(survivors) <- c("survived", "died")
rownames(survivors) <- c("no seat belt", "seat belt")
survivors
##              survived died
## no seat belt     1781  135
## seat belt        1443   47
result.prop <- prop.test(survivors)
result.prop
## 
## 	2-sample test for equality of proportions with continuity correction
## 
## data:  survivors
## X-squared = 24.33, df = 1, p-value = 8.105e-07
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.05401 -0.02383
## sample estimates:
## prop 1 prop 2 
## 0.9295 0.9685
## Analyzing tables Testing contingency of tables
chisq.test(survivors)
## 
## 	Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  survivors
## X-squared = 24.33, df = 1, p-value = 8.105e-07
### Testing tables with more than two columns
str(HairEyeColor)
##  table [1:4, 1:4, 1:2] 32 53 10 3 11 50 10 30 10 25 ...
##  - attr(*, "dimnames")=List of 3
##   ..$ Hair: chr [1:4] "Black" "Brown" "Red" "Blond"
##   ..$ Eye : chr [1:4] "Brown" "Blue" "Hazel" "Green"
##   ..$ Sex : chr [1:2] "Male" "Female"
HairEyeMargin <- margin.table(HairEyeColor, margin = c(1, 2))
HairEyeMargin
##        Eye
## Hair    Brown Blue Hazel Green
##   Black    68   20    15     5
##   Brown   119   84    54    29
##   Red      26   17    14    14
##   Blond     7   94    10    16
chisq.test(HairEyeMargin)
## 
## 	Pearson's Chi-squared test
## 
## data:  HairEyeMargin
## X-squared = 138.3, df = 9, p-value < 2.2e-16
## Extracting test results
str(result)
## List of 4
##  $ statistic: Named num 0.933
##   ..- attr(*, "names")= chr "W"
##  $ p.value  : num 7.76e-05
##  $ method   : chr "Shapiro-Wilk normality test"
##  $ data.name: chr "beaver2$temp"
##  - attr(*, "class")= chr "htest"
t.test(temp ~ activ, data = beaver2)$p.value
## [1] 7.269e-31
# Working with Models Analyzing variances
str(InsectSprays)
## 'data.frame':	72 obs. of  2 variables:
##  $ count: num  10 7 20 14 14 12 10 23 17 20 ...
##  $ spray: Factor w/ 6 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ...
### Building the model
AOVModel <- aov(count ~ spray, data = InsectSprays)
### Looking at the object
AOVModel
## Call:
##    aov(formula = count ~ spray, data = InsectSprays)
## 
## Terms:
##                 spray Residuals
## Sum of Squares   2669      1015
## Deg. of Freedom     5        66
## 
## Residual standard error: 3.922
## Estimated effects may be unbalanced
## Evaluating the differences
summary(AOVModel)
##             Df Sum Sq Mean Sq F value Pr(>F)    
## spray        5   2669     534    34.7 <2e-16 ***
## Residuals   66   1015      15                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
### Checking the model tables
model.tables(AOVModel, type = "effects")
## Tables of effects
## 
##  spray 
## spray
##      A      B      C      D      E      F 
##  5.000  5.833 -7.417 -4.583 -6.000  7.167
### Looking at the individual differences
Comparisons <- TukeyHSD(AOVModel)
Comparisons$spray["D-C", ]
##    diff     lwr     upr   p adj 
##  2.8333 -1.8661  7.5327  0.4921
### Plotting the differences
plot(Comparisons, las = 1)
plot of chunk unnamed-chunk-1
## Modeling linear relations Building a linear model
Model <- lm(mpg ~ wt, data = mtcars)
### Extracting information from the model
coef.Model <- coef(Model)
coef.Model
## (Intercept)          wt 
##      37.285      -5.344
plot(mpg ~ wt, data = mtcars)
abline(a = coef.Model[1], b = coef.Model[2])
plot of chunk unnamed-chunk-1
## Evaluating linear models Summarizing the model
Model.summary <- summary(Model)
Model.summary
## 
## Call:
## lm(formula = mpg ~ wt, data = mtcars)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.543 -2.365 -0.125  1.410  6.873 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   37.285      1.878   19.86  < 2e-16 ***
## wt            -5.344      0.559   -9.56  1.3e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.05 on 30 degrees of freedom
## Multiple R-squared:  0.753,	Adjusted R-squared:  0.745 
## F-statistic: 91.4 on 1 and 30 DF,  p-value: 1.29e-10
coef(Model.summary)
##             Estimate Std. Error t value  Pr(>|t|)
## (Intercept)   37.285     1.8776  19.858 8.242e-19
## wt            -5.344     0.5591  -9.559 1.294e-10
### Testing the impact of model terms
Model.anova <- anova(Model)
Model.anova
## Analysis of Variance Table
## 
## Response: mpg
##           Df Sum Sq Mean Sq F value  Pr(>F)    
## wt         1    848     848    91.4 1.3e-10 ***
## Residuals 30    278       9                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Model.anova["wt", "Pr(>F)"]
## [1] 1.294e-10
## Predicting new values Getting the values
new.cars <- data.frame(wt = c(1.7, 2.4, 3.6))
predict(Model, newdata = new.cars)
##     1     2     3 
## 28.20 24.46 18.05
### Having confidence in your predictions
predict(Model, newdata = new.cars, interval = "confidence")
##     fit   lwr   upr
## 1 28.20 26.15 30.25
## 2 24.46 23.02 25.90
## 3 18.05 16.86 19.23
predict(Model, newdata = new.cars, interval = "prediction")
##     fit   lwr   upr
## 1 28.20 21.65 34.75
## 2 24.46 18.07 30.84
## 3 18.05 11.71 24.38

The R session information (including the OS info, R version and all packages used):

sessionInfo()
## R version 3.0.2 (2013-09-25)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## 
## locale:
## [1] LC_COLLATE=English_United Kingdom.1252  LC_CTYPE=English_United Kingdom.1252   
## [3] LC_MONETARY=English_United Kingdom.1252 LC_NUMERIC=C                           
## [5] LC_TIME=English_United Kingdom.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] BiocInstaller_1.12.1 ggplot2_0.9.3.1      reshape2_1.2.2       sos_1.3-8           
##  [5] brew_1.0-6           stringr_0.6.2        knitr_1.5            plyr_1.8            
##  [9] Revobase_7.1.0       RevoMods_7.1.0       RevoScaleR_7.1.0     lattice_0.20-27     
## [13] rpart_4.1-2         
## 
## loaded via a namespace (and not attached):
##  [1] codetools_0.2-8    colorspace_1.2-4   dichromat_2.0-0    digest_0.6.4      
##  [5] evaluate_0.5.1     foreach_1.4.1      formatR_0.10       fortunes_1.5-2    
##  [9] grid_3.0.2         gtable_0.1.2       highr_0.3          iterators_1.0.6   
## [13] labeling_0.2       MASS_7.3-29        munsell_0.4.2      proto_0.3-10      
## [17] RColorBrewer_1.0-5 scales_0.2.3       tools_3.0.2
Sys.time()
## [1] "2014-05-13 15:06:16 BST"