This report is automatically generated with the R package knitr (version 1.5).
# Chapter 5 - Getting Started with Reading and Writing
# Using Character Vectors for Text Data
## Assigning a value to a character vector
x <- "Hello world!"
is.character(x)
## [1] TRUE
length(x)
## [1] 1
nchar(x)
## [1] 12
## Creating a character vector with more than one element
x <- c("Hello", "world!")
length(x)
## [1] 2
nchar(x)
## [1] 5 6
## Extracting a subset of a vector
letters
## [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q" "r" "s" "t" "u"
## [22] "v" "w" "x" "y" "z"
LETTERS
## [1] "A" "B" "C" "D" "E" "F" "G" "H" "I" "J" "K" "L" "M" "N" "O" "P" "Q" "R" "S" "T" "U"
## [22] "V" "W" "X" "Y" "Z"
letters[10]
## [1] "j"
LETTERS[24:26]
## [1] "X" "Y" "Z"
tail(LETTERS, 5)
## [1] "V" "W" "X" "Y" "Z"
head(letters, 10)
## [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j"
## Naming the values in your vectors
### Looking at how named vectors work
str(islands)
## Named num [1:48] 11506 5500 16988 2968 16 ...
## - attr(*, "names")= chr [1:48] "Africa" "Antarctica" "Asia" "Australia" ...
islands[c("Asia", "Africa", "Antarctica")]
##       Asia     Africa Antarctica
##      16988      11506       5500
names(islands)[1:9]
## [1] "Africa" "Antarctica" "Asia" "Australia" "Axel Heiberg"
## [6] "Baffin" "Banks" "Borneo" "Britain"
names(sort(islands, decreasing = TRUE)[1:6])
## [1] "Asia" "Africa" "North America" "South America" "Antarctica"
## [6] "Europe"
## Creating and assigning named vectors
month.days <- c(31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)
names(month.days) <- month.name
month.days
##   January  February     March     April       May      June      July    August September
##        31        28        31        30        31        30        31        31        30
##   October  November  December
##        31        30        31
names(month.days[month.days == 31])
## [1] "January" "March" "May" "July" "August" "October" "December"
# Manipulating Text
## String theory: Combining and splitting strings
### Splitting text
pangram <- "The quick brown fox jumps over the lazy dog"
pangram
## [1] "The quick brown fox jumps over the lazy dog"
strsplit(pangram, " ")
## [[1]]
## [1] "The" "quick" "brown" "fox" "jumps" "over" "the" "lazy" "dog"
words <- strsplit(pangram, " ")[[1]]
words
## [1] "The" "quick" "brown" "fox" "jumps" "over" "the" "lazy" "dog"
### Changing text case
unique(tolower(words))
## [1] "the" "quick" "brown" "fox" "jumps" "over" "lazy" "dog"
toupper(words[c(4, 9)])
## [1] "FOX" "DOG"
tolower("Some TEXT in Mixed CASE")
## [1] "some text in mixed case"
### Concatenating text
paste("The", "quick", "brown", "fox")
## [1] "The quick brown fox"
paste(c("The", "quick", "brown", "fox"))
## [1] "The" "quick" "brown" "fox"
paste(words, collapse = " ")
## [1] "The quick brown fox jumps over the lazy dog"
paste(words, collapse = "_")
## [1] "The_quick_brown_fox_jumps_over_the_lazy_dog"
paste(LETTERS[1:5], 1:5, sep = "_", collapse = "---")
## [1] "A_1---B_2---C_3---D_4---E_5"
paste("Sample", 1:5)
## [1] "Sample 1" "Sample 2" "Sample 3" "Sample 4" "Sample 5"
paste(c("A", "B"), c(1, 2, 3, 4), sep = "-")
## [1] "A-1" "B-2" "A-3" "B-4"
paste(c("A"), c(1, 2, 3, 4, 5), sep = "-")
## [1] "A-1" "A-2" "A-3" "A-4" "A-5"
## Sorting text
sort(letters, decreasing = TRUE)
## [1] "z" "y" "x" "w" "v" "u" "t" "s" "r" "q" "p" "o" "n" "m" "l" "k" "j" "i" "h" "g" "f"
## [22] "e" "d" "c" "b" "a"
sort(words)
## [1] "brown" "dog" "fox" "jumps" "lazy" "over" "quick" "the" "The"
## Finding text inside text
### Searching for individual words
head(state.name)
## [1] "Alabama" "Alaska" "Arizona" "Arkansas" "California" "Colorado"
### Searching by position
head(substr(state.name, start = 3, stop = 6))
## [1] "abam" "aska" "izon" "kans" "lifo" "lora"
### Searching by pattern
grep("New", state.name)
## [1] 29 30 31 32
state.name[29]
## [1] "New Hampshire"
state.name[grep("New", state.name)]
## [1] "New Hampshire" "New Jersey" "New Mexico" "New York"
state.name[grep("new", state.name)]
## character(0)
### Searching for multiple words
state.name[grep(" ", state.name)]
## [1] "New Hampshire" "New Jersey" "New Mexico" "New York" "North Carolina"
## [6] "North Dakota" "Rhode Island" "South Carolina" "South Dakota" "West Virginia"
state.name[grep("East", state.name)]
## character(0)
## Substituting text
gsub("cheap", "sheep's", "A wolf in cheap clothing")
## [1] "A wolf in sheep's clothing"
x <- c("file_a.csv", "file_b.csv", "file_c.csv")
y <- gsub("file_", "", x)
y
## [1] "a.csv" "b.csv" "c.csv"
gsub(".csv", "", y)
## [1] "a" "b" "c"
#### Extending text functionality with stringr
install.packages("stringr")
## Error in install.packages : Updating loaded packages
library(stringr)
## Revving up with regular expressions
rwords <- c("bach", "back", "beech", "beach", "black")
grep("beach|beech", rwords)
## [1] 3 4
rwords[grep("beach|beech", rwords)]
## [1] "beech" "beach"
rwords[grep("be(a|e)ch", rwords)]
## [1] "beech" "beach"
rwords[grep("b(e*|a*)ch", rwords)]
## [1] "bach" "beech"
# Factoring in Factors
## Creating a factor
directions <- c("North", "East", "South", "South")
factor(directions)
## [1] North East South South
## Levels: East North South
factor(directions, levels = c("North", "East", "South", "West"))
## [1] North East South South
## Levels: North East South West
factor(directions, levels = c("North", "East", "South", "West"), labels = c("N", "E", "S", "W"))
## [1] N E S S
## Levels: N E S W
## Converting a factor
directions <- c("North", "East", "South", "South")
directions.factor <- factor(directions)
directions.factor
## [1] North East South South
## Levels: East North South
as.character(directions.factor)
## [1] "North" "East" "South" "South"
as.numeric(directions.factor)
## [1] 2 1 3 3
numbers <- factor(c(9, 8, 10, 8, 9))
as.character(numbers)
## [1] "9" "8" "10" "8" "9"
as.numeric(numbers)
## [1] 2 1 3 1 2
as.numeric(as.character(numbers))
## [1] 9 8 10 8 9
## Looking at levels
str(state.region)
## Factor w/ 4 levels "Northeast","South",..: 2 4 4 2 4 4 1 2 2 2 ...
levels(state.region)
## [1] "Northeast" "South" "North Central" "West"
levels(state.region) <- c("NE", "S", "NC", "W")
head(state.region)
## [1] S W W S W W
## Levels: NE S NC W
nlevels(state.region)
## [1] 4
length(levels(state.region))
## [1] 4
levels(state.region)[2:3]
## [1] "S" "NC"
## Distinguishing data types
head(state.region)
## [1] S W W S W W
## Levels: NE S NC W
table(state.region)
## state.region
## NE  S NC  W
##  9 16 12 13
state.region
## [1] S W W S W W NE S S S W W NC NC NC NC S S NE S NE NC NC S NC W NC W
## [29] NE NE W NE S NC NC S W NE NE S NC S S W NE S W S NC W
## Levels: NE S NC W
## Working with ordered factors
status <- c("Lo", "Hi", "Med", "Med", "Hi")
ordered.status <- factor(status, levels = c("Lo", "Med", "Hi"), ordered = TRUE)
ordered.status
## [1] Lo Hi Med Med Hi
## Levels: Lo < Med < Hi
table(status)
## status
##  Hi  Lo Med
##   2   1   2
table(ordered.status)
## ordered.status
##  Lo Med  Hi
##   1   2   2
The R session information (including the OS info, R version and all packages used):
sessionInfo()
## R version 3.0.2 (2013-09-25)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
##
## locale:
## [1] LC_COLLATE=English_United Kingdom.1252 LC_CTYPE=English_United Kingdom.1252
## [3] LC_MONETARY=English_United Kingdom.1252 LC_NUMERIC=C
## [5] LC_TIME=English_United Kingdom.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] BiocInstaller_1.12.1 ggplot2_0.9.3.1 reshape2_1.2.2 sos_1.3-8
## [5] brew_1.0-6 stringr_0.6.2 knitr_1.5 plyr_1.8
## [9] Revobase_7.1.0 RevoMods_7.1.0 RevoScaleR_7.1.0 lattice_0.20-27
## [13] rpart_4.1-2
##
## loaded via a namespace (and not attached):
## [1] codetools_0.2-8 colorspace_1.2-4 dichromat_2.0-0 digest_0.6.4
## [5] evaluate_0.5.1 foreach_1.4.1 formatR_0.10 fortunes_1.5-2
## [9] grid_3.0.2 gtable_0.1.2 highr_0.3 iterators_1.0.6
## [13] labeling_0.2 MASS_7.3-29 munsell_0.4.2 proto_0.3-10
## [17] RColorBrewer_1.0-5 scales_0.2.3 tools_3.0.2
Sys.time()
## [1] "2014-05-13 15:05:35 BST"