This report is automatically generated with the R package knitr (version 1.5).

# Chapter 14: Summarizing Data
# Starting with the Right Data
## Using factors or numeric data
# Counting unique values
# Number of distinct values in each column of mtcars. vapply() pins the result
# to exactly one integer per column, so the return type cannot silently change
# with the input the way sapply()'s can.
vapply(mtcars, function(x) length(unique(x)), integer(1))
##  mpg  cyl disp   hp drat   wt qsec   vs   am gear carb 
##   25    3   27   22   22   29   30    2    2    3    6
## Preparing the data
# Keep only the mpg, cyl, am and gear columns of mtcars (selected by name).
cars <- mtcars[c("mpg", "cyl", "am", "gear")]
# gear becomes an ordered factor; the two sorted values of am are relabelled
# "auto" and "manual", in that order.
cars[["gear"]] <- factor(cars[["gear"]], ordered = TRUE)
cars[["am"]] <- factor(cars[["am"]], labels = c("auto", "manual"))
str(cars)
## 'data.frame':	32 obs. of  4 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ am  : Factor w/ 2 levels "auto","manual": 2 2 2 1 1 1 1 1 1 1 ...
##  $ gear: Ord.factor w/ 3 levels "3"<"4"<"5": 2 2 2 1 1 1 1 2 2 2 ...
# Describing Continuous Variables
## Talking about the center of your data
mean(cars[["mpg"]])  # arithmetic mean of the mpg column
## [1] 20.09
median(cars[["cyl"]])  # middle value of the (still numeric) cyl column
## [1] 6
## Describing the variation
sd(cars[["mpg"]])  # sample standard deviation
## [1] 6.027
## Checking the quantiles
### Calculating the range
range(cars[["mpg"]])  # smallest and largest value
## [1] 10.4 33.9
### Calculating the quantiles
quantile(cars[["mpg"]])  # default probs: 0, 0.25, 0.5, 0.75, 1
##    0%   25%   50%   75%  100% 
## 10.40 15.43 19.20 22.80 33.90
### Getting on speed with the quantile function
quantile(cars[["mpg"]], probs = c(0.05, 0.95))  # 5th and 95th percentiles
##   5%  95% 
## 12.0 31.3
# Describing Categories
## Counting appearances
### Creating a table
amtable <- table(cars[["am"]])  # counts per level of am
amtable
## 
##   auto manual 
##     19     13
### Working with tables
#### Calculating proportions
amtable / sum(amtable)  # by hand: each count over the total
## 
##   auto manual 
## 0.5938 0.4062
prop.table(amtable)  # same result via the dedicated helper
## 
##   auto manual 
## 0.5938 0.4062
## Finding the center
# The mode: every level whose count equals the maximum (ties are all kept).
id <- amtable == max(amtable)
names(amtable)[id]
## [1] "auto"
# Describing Distributions
## Plotting histograms
### Making the plot
# Histogram of mpg with the default break points and grey bars.
hist(cars$mpg, col = "grey")
plot of chunk unnamed-chunk-1
### Playing with breaks
# Explicit break points give three bins: 5-15, 15-25 and 25-35.
hist(cars$mpg, breaks = c(5, 15, 25, 35))
plot of chunk unnamed-chunk-1
## Using frequencies or densities
### Creating a density plot
# Estimate the density once; the result is plotted here and reused for the
# overlay on the histogram further down.
mpgdens <- density(cars$mpg)
plot(mpgdens)
plot of chunk unnamed-chunk-1
### Plotting densities in a histogram
# freq = FALSE draws densities instead of counts, so the density curve added
# by lines() is on the same vertical scale as the bars.
hist(cars$mpg, col = "grey", freq = FALSE)
lines(mpgdens)
plot of chunk unnamed-chunk-1
# Describing Multiple Variables
## Summarizing a complete dataset
### Getting the output
summary(cars)  # numeric columns get quantiles and mean, factors get level counts
##       mpg            cyl            am     gear  
##  Min.   :10.4   Min.   :4.00   auto  :19   3:15  
##  1st Qu.:15.4   1st Qu.:4.00   manual:13   4:12  
##  Median :19.2   Median :6.00               5: 5  
##  Mean   :20.1   Mean   :6.19                     
##  3rd Qu.:22.8   3rd Qu.:8.00                     
##  Max.   :33.9   Max.   :8.00
### Fixing a problem
# cyl was summarised as a number above; make it a factor so it is treated as
# a categorical variable from here on.
cars[["cyl"]] <- factor(cars[["cyl"]])
## Plotting quantiles for subgroups
boxplot(mpg ~ cyl, data = cars)  # one box of mpg per cyl level
plot of chunk unnamed-chunk-1
## Tracking correlations
colnames(iris)
## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width"  "Species"
### Looking at relations
# Scatterplot matrix of every column except Species.
plot(iris[names(iris) != "Species"])
plot of chunk unnamed-chunk-1
### Getting the numbers
cor(iris[["Petal.Width"]], iris[["Petal.Length"]])  # correlation of one pair of columns
## [1] 0.9629
### Calculating correlations for multiple variables
# Correlation matrix of all columns except Species.
iris.cor <- cor(iris[names(iris) != "Species"])
str(iris.cor)
##  num [1:4, 1:4] 1 -0.118 0.872 0.818 -0.118 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
##   ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
iris.cor["Petal.Width", "Petal.Length"]  # look a single pair up by row/column name
## [1] 0.9629
### Dealing with missing values
# Working with Tables
## Creating a two-way table
### Creating a table from two variables
# Cross-tabulate am against gear; the argument names become the dimension
# labels shown in the printed table.
table(am = cars$am, gear = cars$gear)
##         gear
## am        3  4  5
##   auto   15  4  0
##   manual  0  8  5
### Creating tables from a matrix
# 2 x 2 matrix of counts, filled column by column, with row and column
# labels supplied up front.
trial <- matrix(
  c(34, 11, 9, 32),
  ncol = 2,
  dimnames = list(c("risk", "no_risk"), c("sick", "healthy"))
)
trial.table <- as.table(trial)
trial.table
##         sick healthy
## risk      34       9
## no_risk   11      32
### Extracting the numbers
trial.table["risk", "sick"]  # index by row and column name
## [1] 34
## Converting tables to a data frame
trial.df <- as.data.frame(trial)  # from the matrix: one column per matrix column
str(trial.df)
## 'data.frame':	2 obs. of  2 variables:
##  $ sick   : num  34 11
##  $ healthy: num  9 32
trial.table.df <- as.data.frame(trial.table)  # from the table: one row per cell, count in Freq
str(trial.table.df)
## 'data.frame':	4 obs. of  3 variables:
##  $ Var1: Factor w/ 2 levels "risk","no_risk": 1 2 1 2
##  $ Var2: Factor w/ 2 levels "sick","healthy": 1 1 2 2
##  $ Freq: num  34 11 9 32
## Looking at margins and proportions
### Adding margins to the table
addmargins(trial.table)  # totals for rows, columns and overall
##         sick healthy Sum
## risk      34       9  43
## no_risk   11      32  43
## Sum       45      41  86
addmargins(trial.table, margin = 2)  # only a Sum column holding the row totals
##         sick healthy Sum
## risk      34       9  43
## no_risk   11      32  43
### Calculating proportions
prop.table(trial.table)  # each cell as a share of the grand total
##           sick healthy
## risk    0.3953  0.1047
## no_risk 0.1279  0.3721
### Calculating proportions over columns and rows
prop.table(trial.table, margin = 1)  # shares within each row; rows sum to 1
##           sick healthy
## risk    0.7907  0.2093
## no_risk 0.2558  0.7442

The R session information (including the OS info, R version and all packages used):

sessionInfo()  # prints the R version, platform, locale and attached/loaded packages
## R version 3.0.2 (2013-09-25)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## 
## locale:
## [1] LC_COLLATE=English_United Kingdom.1252  LC_CTYPE=English_United Kingdom.1252   
## [3] LC_MONETARY=English_United Kingdom.1252 LC_NUMERIC=C                           
## [5] LC_TIME=English_United Kingdom.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] BiocInstaller_1.12.1 ggplot2_0.9.3.1      reshape2_1.2.2       sos_1.3-8           
##  [5] brew_1.0-6           stringr_0.6.2        knitr_1.5            plyr_1.8            
##  [9] Revobase_7.1.0       RevoMods_7.1.0       RevoScaleR_7.1.0     lattice_0.20-27     
## [13] rpart_4.1-2         
## 
## loaded via a namespace (and not attached):
##  [1] codetools_0.2-8    colorspace_1.2-4   dichromat_2.0-0    digest_0.6.4      
##  [5] evaluate_0.5.1     foreach_1.4.1      formatR_0.10       fortunes_1.5-2    
##  [9] grid_3.0.2         gtable_0.1.2       highr_0.3          iterators_1.0.6   
## [13] labeling_0.2       MASS_7.3-29        munsell_0.4.2      proto_0.3-10      
## [17] RColorBrewer_1.0-5 scales_0.2.3       tools_3.0.2
Sys.time()  # current date-time, stamping when this code was run
## [1] "2014-05-13 15:06:14 BST"