This report is automatically generated with the R package knitr (version 1.5).

# Chapter 5 - Getting Started with Reading and Writing
#
# Using Character Vectors for Text Data
#
# This section creates character vectors, inspects their length and character
# counts, and extracts elements by position. Lines of the form "## [n] ..."
# are the printed result of the expression directly above them.
#
## Assigning a value to a character vector
x <- "Hello world!"
is.character(x)
## [1] TRUE
# length() counts elements (a single string here); nchar() counts characters.
length(x)
## [1] 1
nchar(x)
## [1] 12
## Creating a character vector with more than one element
x <- c("Hello", "world!")
length(x)
## [1] 2
# nchar() is applied element-wise, giving one count per string.
nchar(x)
## [1] 5 6
## Extracting a subset of a vector
# `letters` and `LETTERS` are built-in vectors of the 26 lower- and upper-case
# letters, as the printed values below show.
letters
##  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q" "r" "s" "t" "u"
## [22] "v" "w" "x" "y" "z"
LETTERS
##  [1] "A" "B" "C" "D" "E" "F" "G" "H" "I" "J" "K" "L" "M" "N" "O" "P" "Q" "R" "S" "T" "U"
## [22] "V" "W" "X" "Y" "Z"
# Square brackets select by position: a single index or a range of indices.
letters[10]
## [1] "j"
LETTERS[24:26]
## [1] "X" "Y" "Z"
# tail() and head() return the last / first n elements respectively.
tail(LETTERS, 5)
## [1] "V" "W" "X" "Y" "Z"
head(letters, 10)
##  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j"
## Naming the values in your vectors
#
# This section shows how names attached to vector elements can be used to
# select values, and how to attach names to a vector of your own.
#
### Looking at how named vectors work
# `islands` is a named numeric vector (see the str() output), so elements can
# be selected by name as well as by position.
str(islands)
##  Named num [1:48] 11506 5500 16988 2968 16 ...
##  - attr(*, "names")= chr [1:48] "Africa" "Antarctica" "Asia" "Australia" ...
islands[c("Asia", "Africa", "Antarctica")]
##       Asia     Africa Antarctica 
##      16988      11506       5500
names(islands)[1:9]
## [1] "Africa"       "Antarctica"   "Asia"         "Australia"    "Axel Heiberg"
## [6] "Baffin"       "Banks"        "Borneo"       "Britain"
# Sort by value in decreasing order, keep the first six, and report their names.
names(sort(islands, decreasing = TRUE)[1:6])
## [1] "Asia"          "Africa"        "North America" "South America" "Antarctica"   
## [6] "Europe"
## Creating and assigning named vectors
# Days per month, with February entered as 28 (i.e. a non-leap year).
month.days <- c(31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)
# `month.name` is the built-in vector of month names; assigning it to names()
# labels each element, as the printed vector below shows.
names(month.days) <- month.name
month.days
##   January  February     March     April       May      June      July    August September 
##        31        28        31        30        31        30        31        31        30 
##   October  November  December 
##        31        30        31
# Logical subsetting keeps the 31-day months; names() then returns their labels.
names(month.days[month.days == 31])
## [1] "January"  "March"    "May"      "July"     "August"   "October"  "December"
# Manipulating Text
#
## String theory: Combining and splitting strings
#
# This section splits a sentence into words and changes the case of text.
#
### Splitting text
pangram <- "The quick brown fox jumps over the lazy dog"
pangram
## [1] "The quick brown fox jumps over the lazy dog"
# strsplit() returns a list (note the [[1]] in the output), with one element
# per input string.
strsplit(pangram, " ")
## [[1]]
## [1] "The"   "quick" "brown" "fox"   "jumps" "over"  "the"   "lazy"  "dog"
# [[1]] extracts the character vector of words from that one-element list.
words <- strsplit(pangram, " ")[[1]]
words
## [1] "The"   "quick" "brown" "fox"   "jumps" "over"  "the"   "lazy"  "dog"
### Changing text case
# Lower-casing first makes "The" and "the" identical, so unique() keeps one.
unique(tolower(words))
## [1] "the"   "quick" "brown" "fox"   "jumps" "over"  "lazy"  "dog"
toupper(words[c(4, 9)])
## [1] "FOX" "DOG"
tolower("Some TEXT in Mixed CASE")
## [1] "some text in mixed case"
### Concatenating text
#
# Demonstrates paste(): joining separate arguments, collapsing a vector into
# one string, and recycling of shorter arguments.
#
# Separate arguments are joined into one string, separated by a space.
paste("The", "quick", "brown", "fox")
## [1] "The quick brown fox"
# A single vector argument is returned element by element (nothing to join).
paste(c("The", "quick", "brown", "fox"))
## [1] "The"   "quick" "brown" "fox"
# `collapse` joins the elements of the result into a single string.
paste(words, collapse = " ")
## [1] "The quick brown fox jumps over the lazy dog"
paste(words, collapse = "_")
## [1] "The_quick_brown_fox_jumps_over_the_lazy_dog"
# `sep` joins the arguments element-wise; `collapse` then joins those results.
paste(LETTERS[1:5], 1:5, sep = "_", collapse = "---")
## [1] "A_1---B_2---C_3---D_4---E_5"
# Shorter arguments are recycled to the length of the longest one.
paste("Sample", 1:5)
## [1] "Sample 1" "Sample 2" "Sample 3" "Sample 4" "Sample 5"
paste(c("A", "B"), c(1, 2, 3, 4), sep = "-")
## [1] "A-1" "B-2" "A-3" "B-4"
paste(c("A"), c(1, 2, 3, 4, 5), sep = "-")
## [1] "A-1" "A-2" "A-3" "A-4" "A-5"
## Sorting text
#
# Sorts character vectors; `words` is the vector of pangram words created
# earlier in this script.
sort(letters, decreasing = TRUE)
##  [1] "z" "y" "x" "w" "v" "u" "t" "s" "r" "q" "p" "o" "n" "m" "l" "k" "j" "i" "h" "g" "f"
## [22] "e" "d" "c" "b" "a"
# NOTE: the relative order of "the" and "The" depends on the locale's collation
# rules; the output below was produced under LC_COLLATE=English_United Kingdom.
sort(words)
## [1] "brown" "dog"   "fox"   "jumps" "lazy"  "over"  "quick" "the"   "The"
## Finding text inside text
#
# This section searches the built-in vector of US state names by character
# position and by pattern.
#
### Searching for individual words
# The built-in dataset is called `state.name` (singular); `state.names` does
# not exist and raises "object 'state.names' not found".
head(state.name)
## [1] "Alabama"    "Alaska"     "Arizona"    "Arkansas"   "California" "Colorado"
### Searching by position
# substr() extracts characters 3 to 6 of every state name.
head(substr(state.name, start = 3, stop = 6))
## [1] "abam" "aska" "izon" "kans" "lifo" "lora"
### Searching by pattern
# grep() returns the positions of the elements that match the pattern.
grep("New", state.name)
## [1] 29 30 31 32
state.name[29]
## [1] "New Hampshire"
# Using those positions as an index returns the matching names themselves.
state.name[grep("New", state.name)]
## [1] "New Hampshire" "New Jersey"    "New Mexico"    "New York"
# Matching is case sensitive: lower-case "new" finds nothing.
state.name[grep("new", state.name)]
## character(0)
### Searching for multiple words
# A space in the pattern finds every state whose name has more than one word.
state.name[grep(" ", state.name)]
##  [1] "New Hampshire"  "New Jersey"     "New Mexico"     "New York"       "North Carolina"
##  [6] "North Dakota"   "Rhode Island"   "South Carolina" "South Dakota"   "West Virginia"
state.name[grep("East", state.name)]
## character(0)
## Substituting text
#
# This section replaces parts of strings with gsub(pattern, replacement, x).
gsub("cheap", "sheep's", "A wolf in cheap clothing")
## [1] "A wolf in sheep's clothing"
x <- c("file_a.csv", "file_b.csv", "file_c.csv")
y <- gsub("file_", "", x)
y
## [1] "a.csv" "b.csv" "c.csv"
# The pattern is a regular expression, in which an unescaped "." matches ANY
# character. Escape the dot and anchor the pattern with "$" so that only a
# literal ".csv" extension at the end of the name is removed; an unescaped
# ".csv" would also strip e.g. "xcsv" from the middle of a file name.
gsub("\\.csv$", "", y)
## [1] "a" "b" "c"
#### Extending text functionality with stringr
#
# Install stringr only when it is not already available. An unconditional
# install.packages() call re-downloads the package on every run and fails
# with "Updating loaded packages" when stringr is already loaded in the
# session.
if (!requireNamespace("stringr", quietly = TRUE)) {
  install.packages("stringr")
}
library(stringr)
## Revving up with regular expressions
#
# This section uses grep() with regular-expression alternation and repetition
# to select words from a small test vector.
rwords <- c("bach", "back", "beech", "beach", "black")
# "|" means "or": match either complete alternative.
grep("beach|beech", rwords)
## [1] 3 4
rwords[grep("beach|beech", rwords)]
## [1] "beech" "beach"
# Parentheses limit the alternation to a single character position.
rwords[grep("be(a|e)ch", rwords)]
## [1] "beech" "beach"
# "*" means zero or more repetitions. The group matches a run of e's OR a run
# of a's, never a mix, so "beach" (an e followed by an a) is not matched,
# while "bach" and "beech" are.
rwords[grep("b(e*|a*)ch", rwords)]
## [1] "bach"  "beech"
# Factoring in Factors
#
# This section creates factors from character and numeric data and converts
# them back to character and numeric vectors.
#
## Creating a factor
directions <- c("North", "East", "South", "South")
# With no `levels` argument the levels come out in sorted order (see output).
factor(directions)
## [1] North East  South South
## Levels: East North South
# Explicit `levels` fix the order and may include values absent from the data
# ("West" here).
factor(directions, levels = c("North", "East", "South", "West"))
## [1] North East  South South
## Levels: North East South West
# `labels` renames the levels, matched to `levels` by position.
factor(directions, levels = c("North", "East", "South", "West"), labels = c("N", "E", "S",
    "W"))
## [1] N E S S
## Levels: N E S W
## Converting a factor
directions <- c("North", "East", "South", "South")
directions.factor <- factor(directions)
directions.factor
## [1] North East  South South
## Levels: East North South
as.character(directions.factor)
## [1] "North" "East"  "South" "South"
# as.numeric() on a factor returns the position of each value within the
# levels (East = 1, North = 2, South = 3), not the labels themselves.
as.numeric(directions.factor)
## [1] 2 1 3 3
numbers <- factor(c(9, 8, 10, 8, 9))
as.character(numbers)
## [1] "9"  "8"  "10" "8"  "9"
# Pitfall: this yields the level positions (8 = 1, 9 = 2, 10 = 3), not the
# original numbers.
as.numeric(numbers)
## [1] 2 1 3 1 2
# Converting to character first recovers the original numeric values.
as.numeric(as.character(numbers))
## [1]  9  8 10  8  9
## Looking at levels
#
# This section inspects and renames the levels of the built-in factor
# `state.region`, then tabulates it.
str(state.region)
##  Factor w/ 4 levels "Northeast","South",..: 2 4 4 2 4 4 1 2 2 2 ...
levels(state.region)
## [1] "Northeast"     "South"         "North Central" "West"
# Rename the levels by position. NOTE: because `state.region` is a built-in
# dataset, this assignment creates a modified copy in the workspace that masks
# the original for the rest of the session.
levels(state.region) <- c("NE", "S", "NC", "W")
head(state.region)
## [1] S W W S W W
## Levels: NE S NC W
# nlevels(x) gives the same count as length(levels(x)).
nlevels(state.region)
## [1] 4
length(levels(state.region))
## [1] 4
levels(state.region)[2:3]
## [1] "S"  "NC"
## Distinguishing data types
head(state.region)
## [1] S W W S W W
## Levels: NE S NC W
# table() counts the number of observations in each level.
table(state.region)
## state.region
## NE  S NC  W 
##  9 16 12 13
state.region
##  [1] S  W  W  S  W  W  NE S  S  S  W  W  NC NC NC NC S  S  NE S  NE NC NC S  NC W  NC W 
## [29] NE NE W  NE S  NC NC S  W  NE NE S  NC S  S  W  NE S  W  S  NC W 
## Levels: NE S NC W
## Working with ordered factors
#
# This section builds an ordered factor and compares how table() arranges its
# counts for plain character data versus the ordered factor.
status <- c("Lo", "Hi", "Med", "Med", "Hi")
# `ordered = TRUE` records that the levels have a rank: Lo < Med < Hi.
ordered.status <- factor(status, levels = c("Lo", "Med", "Hi"), ordered = TRUE)
ordered.status
## [1] Lo  Hi  Med Med Hi 
## Levels: Lo < Med < Hi
# Tabulating the character vector lists the categories alphabetically ...
table(status)
## status
##  Hi  Lo Med 
##   2   1   2
# ... whereas tabulating the ordered factor follows the declared level order.
table(ordered.status)
## ordered.status
##  Lo Med  Hi 
##   1   2   2

The R session information (including the OS info, R version and all packages used):

# Record the environment that produced this report: R version, platform,
# locale, and attached / loaded packages, followed by the time of the run.
sessionInfo()
## R version 3.0.2 (2013-09-25)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## 
## locale:
## [1] LC_COLLATE=English_United Kingdom.1252  LC_CTYPE=English_United Kingdom.1252   
## [3] LC_MONETARY=English_United Kingdom.1252 LC_NUMERIC=C                           
## [5] LC_TIME=English_United Kingdom.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] BiocInstaller_1.12.1 ggplot2_0.9.3.1      reshape2_1.2.2       sos_1.3-8           
##  [5] brew_1.0-6           stringr_0.6.2        knitr_1.5            plyr_1.8            
##  [9] Revobase_7.1.0       RevoMods_7.1.0       RevoScaleR_7.1.0     lattice_0.20-27     
## [13] rpart_4.1-2         
## 
## loaded via a namespace (and not attached):
##  [1] codetools_0.2-8    colorspace_1.2-4   dichromat_2.0-0    digest_0.6.4      
##  [5] evaluate_0.5.1     foreach_1.4.1      formatR_0.10       fortunes_1.5-2    
##  [9] grid_3.0.2         gtable_0.1.2       highr_0.3          iterators_1.0.6   
## [13] labeling_0.2       MASS_7.3-29        munsell_0.4.2      proto_0.3-10      
## [17] RColorBrewer_1.0-5 scales_0.2.3       tools_3.0.2
# Timestamp of when the report was generated.
Sys.time()
## [1] "2014-05-13 15:05:35 BST"