This report is automatically generated with the R package knitr (version 1.5).
# Chapter 14 Summarizing Data Starting with the Right Data Using factors or numeric data # Counting unique values sapply(mtcars, function(x) length(unique(x)))
## mpg cyl disp hp drat wt qsec vs am gear carb ## 25 3 27 22 22 29 30 2 2 3 6
## Preparing the data cars <- mtcars[c(1, 2, 9, 10)] cars$gear <- ordered(cars$gear) cars$am <- factor(cars$am, labels = c("auto", "manual")) str(cars)
## 'data.frame': 32 obs. of 4 variables: ## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ... ## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ... ## $ am : Factor w/ 2 levels "auto","manual": 2 2 2 1 1 1 1 1 1 1 ... ## $ gear: Ord.factor w/ 3 levels "3"<"4"<"5": 2 2 2 1 1 1 1 2 2 2 ...
# Describing Continuous Variables Talking about the center of your data mean(cars$mpg)
## [1] 20.09
median(cars$cyl)
## [1] 6
## Describing the variation sd(cars$mpg)
## [1] 6.027
## Checking the quantiles Calculating the range range(cars$mpg)
## [1] 10.4 33.9
### Calculating the quantiles quantile(cars$mpg)
## 0% 25% 50% 75% 100% ## 10.40 15.43 19.20 22.80 33.90
### Getting on speed with the quantile function quantile(cars$mpg, probs = c(0.05, 0.95))
## 5% 95% ## 12.0 31.3
# Describing Categories Counting appearances Creating a table amtable <- table(cars$am) amtable
## ## auto manual ## 19 13
### Working with tables Calculating proportions amtable/sum(amtable)
## ## auto manual ## 0.5938 0.4062
prop.table(amtable)
## ## auto manual ## 0.5938 0.4062
## Finding the center id <- amtable == max(amtable) names(amtable)[id]
## [1] "auto"
# Describing Distributions Plotting histograms Making the plot hist(cars$mpg, col = "grey")
### Playing with breaks hist(cars$mpg, breaks = c(5, 15, 25, 35))
## Using frequencies or densities Creating a density plot mpgdens <- density(cars$mpg) plot(mpgdens)
### Plotting densities in a histogram hist(cars$mpg, col = "grey", freq = FALSE) lines(mpgdens)
# Describing Multiple Variables Summarizing a complete dataset Getting the output summary(cars)
## mpg cyl am gear ## Min. :10.4 Min. :4.00 auto :19 3:15 ## 1st Qu.:15.4 1st Qu.:4.00 manual:13 4:12 ## Median :19.2 Median :6.00 5: 5 ## Mean :20.1 Mean :6.19 ## 3rd Qu.:22.8 3rd Qu.:8.00 ## Max. :33.9 Max. :8.00
### Fixing a problem cars$cyl <- as.factor(cars$cyl) ## Plotting quantiles for subgroups boxplot(mpg ~ cyl, data = cars)
## Tracking correlations names(iris)
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
### Looking at relations plot(iris[-5])
### Getting the numbers with(iris, cor(Petal.Width, Petal.Length))
## [1] 0.9629
### Calculating correlations for multiple variables iris.cor <- cor(iris[-5]) str(iris.cor)
## num [1:4, 1:4] 1 -0.118 0.872 0.818 -0.118 ... ## - attr(*, "dimnames")=List of 2 ## ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" ## ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
iris.cor["Petal.Width", "Petal.Length"]
## [1] 0.9629
### Dealing with missing values Working with Tables Creating a two-way table Creating a table from two variables with(cars, table(am, gear))
## gear ## am 3 4 5 ## auto 15 4 0 ## manual 0 8 5
### Creating tables from a matrix trial <- matrix(c(34, 11, 9, 32), ncol = 2) colnames(trial) <- c("sick", "healthy") rownames(trial) <- c("risk", "no_risk") trial.table <- as.table(trial) trial.table
## sick healthy ## risk 34 9 ## no_risk 11 32
### Extracting the numbers trial.table["risk", "sick"]
## [1] 34
## Converting tables to a data frame trial.df <- as.data.frame(trial) str(trial.df)
## 'data.frame': 2 obs. of 2 variables: ## $ sick : num 34 11 ## $ healthy: num 9 32
trial.table.df <- as.data.frame(trial.table) str(trial.table.df)
## 'data.frame': 4 obs. of 3 variables: ## $ Var1: Factor w/ 2 levels "risk","no_risk": 1 2 1 2 ## $ Var2: Factor w/ 2 levels "sick","healthy": 1 1 2 2 ## $ Freq: num 34 11 9 32
## Looking at margins and proportions Adding margins to the table addmargins(trial.table)
## sick healthy Sum ## risk 34 9 43 ## no_risk 11 32 43 ## Sum 45 41 86
addmargins(trial.table, margin = 2)
## sick healthy Sum ## risk 34 9 43 ## no_risk 11 32 43
### Calculating proportions prop.table(trial.table)
## sick healthy ## risk 0.3953 0.1047 ## no_risk 0.1279 0.3721
### Calculating proportions over columns and rows prop.table(trial.table, margin = 1)
## sick healthy ## risk 0.7907 0.2093 ## no_risk 0.2558 0.7442
The R session information (including the OS info, R version and all packages used):
sessionInfo()
## R version 3.0.2 (2013-09-25) ## Platform: x86_64-w64-mingw32/x64 (64-bit) ## ## locale: ## [1] LC_COLLATE=English_United Kingdom.1252 LC_CTYPE=English_United Kingdom.1252 ## [3] LC_MONETARY=English_United Kingdom.1252 LC_NUMERIC=C ## [5] LC_TIME=English_United Kingdom.1252 ## ## attached base packages: ## [1] stats graphics grDevices utils datasets methods base ## ## other attached packages: ## [1] BiocInstaller_1.12.1 ggplot2_0.9.3.1 reshape2_1.2.2 sos_1.3-8 ## [5] brew_1.0-6 stringr_0.6.2 knitr_1.5 plyr_1.8 ## [9] Revobase_7.1.0 RevoMods_7.1.0 RevoScaleR_7.1.0 lattice_0.20-27 ## [13] rpart_4.1-2 ## ## loaded via a namespace (and not attached): ## [1] codetools_0.2-8 colorspace_1.2-4 dichromat_2.0-0 digest_0.6.4 ## [5] evaluate_0.5.1 foreach_1.4.1 formatR_0.10 fortunes_1.5-2 ## [9] grid_3.0.2 gtable_0.1.2 highr_0.3 iterators_1.0.6 ## [13] labeling_0.2 MASS_7.3-29 munsell_0.4.2 proto_0.3-10 ## [17] RColorBrewer_1.0-5 scales_0.2.3 tools_3.0.2
Sys.time()
## [1] "2014-05-13 15:06:14 BST"