quote <- "The most valuable thing you can have as a leader is clear data"
author <- "Ruth Porat"
paste()
function under Base R is used for creating and building strings. str_c()
is equivalent to the paste()
function. paste(quote, "by", author)
## [1] "The most valuable thing you can have as a leader is clear data by Ruth Porat"
paste0()
to paste without spaces between characters.
paste0("I", "love", "Data", "Preprocessing")
## [1] "IloveDataPreprocessing"
is.character()
and any other data format can be converted into strings/characters with as.character()
.
is.character(quote)
## [1] TRUE
as.character(pi)
## [1] "3.14159265358979"
Printing strings/characters can be done with the following:
Function | Usage |
---|---|
print() |
generic printing |
noquote() |
print with no quotes |
cat() |
concatenate and print with no quotes (no line number) |
# print without quotes
print( paste(quote,author) , quote = FALSE)
## [1] The most valuable thing you can have as a leader is clear data Ruth Porat
# same as above; the only difference is that `cat()` does not print the numeric line indicator
cat( paste(quote,author) )
## The most valuable thing you can have as a leader is clear data Ruth Porat
# basic printing of the alphabet
cat(letters)
## a b c d e f g h i j k l m n o p q r s t u v w x y z
# specify a separator between the combined characters
cat(letters, sep = "-")
## a-b-c-d-e-f-g-h-i-j-k-l-m-n-o-p-q-r-s-t-u-v-w-x-y-z
fill
argument.
# No breaks between lines
cat(quote, author, fill = FALSE)
## The most valuable thing you can have as a leader is clear data Ruth Porat
# Breaks between lines
cat(letters, letters, letters, fill = TRUE)
## a b c d e f g h i j k l m n o p q r s t u v w x y z a b c d e f g h i j k l m n ## o p q r s t u v w x y z a b c d e f g h i j k l m n o p q r s t u v w x y z
length("How many elements are in this string?")
## [1] 1
length( c("How", "many", "elements", "are", "in", "this", "string?") )
## [1] 7
nchar()
.
nchar("How many characters are in this string?")
## [1] 39
nchar(c("How", "many", "characters", "are", "in", "this", "string?"))
## [1] 3 4 10 3 2 4 7
Basic string manipulation typically includes:
These operations can all be performed with base R functions; however, some operations are greatly simplified with the stringr
package.
tolower()
.toupper()
.
a <- "MATH2349 is AWesomE"
tolower(a)
## [1] "math2349 is awesome"
toupper(a)
## [1] "MATH2349 IS AWESOME"
chartr()
.
# replace 'z' with 's'
american <- "This is how we analyze."
chartr(old = "z", new = "s", american)
## [1] "This is how we analyse."
# replace 'i' with 'w', 'X' with 'h' and 's' with 'y'
x <- "MiXeD cAsE 123"
chartr(old ="iXs", new ="why", x)
## [1] "MwheD cAyE 123"
gsub()
.
# replace "ot" pattern with "ut"
x <- "R Totorial"
gsub(pattern = "ot", replacement="ut", x)
## [1] "R Tutorial"
abbreviate()
.
streets <- c("Victoria", "Yarra", "Russell", "Williams", "Swanston")
# default abbreviations
abbreviate(streets)
## Victoria Yarra Russell Williams Swanston ## "Vctr" "Yarr" "Rssl" "Wllm" "Swns"
# set the minimum length of the abbreviation
abbreviate(streets, minlength = 2)
## Victoria Yarra Russell Williams Swanston ## "Vc" "Yr" "Rs" "Wl" "Sw"
substr()
is used to extract and replace substrings between specified start and stop positions.
alphabet <- paste(LETTERS, collapse = "")
alphabet
## [1] "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
# extract the 18th-24th characters of alphabet
substr(alphabet, start = 18, stop = 24)
## [1] "RSTUVWX"
# replace the 19th-24th characters with `R`
substr(alphabet, start = 19, stop = 24) <- "RRRRRR"
alphabet
## [1] "ABCDEFGHIJKLMNOPQRRRRRRRYZ"
strsplit()
.
z <- "Victoria Yarra Russell Williams Swanston"
strsplit(z, split = " ")
## [[1]]## [1] "Victoria" "Yarra" "Russell" "Williams" "Swanston"
a <- "Victoria-Yarra-Russell-Williams-Swanston"
strsplit(a, split = "-")
## [[1]]## [1] "Victoria" "Yarra" "Russell" "Williams" "Swanston"
strsplit()
is a list. To convert the output to a simple atomic vector simply use unlist()
. unlist(strsplit(a, split = "-"))
## [1] "Victoria" "Yarra" "Russell" "Williams" "Swanston"
Function | Usage |
---|---|
union() |
obtain union between two character vectors |
intersect() |
obtain the common elements of two character vectors |
setdiff() |
obtain the non-common elements, or the difference |
setequal() |
tests if two vectors contain the same elements regardless of order |
identical() |
tests if two character vectors are equal in content and order |
set_1 <- c("VIC", "NSW", "WA", "TAS")
set_2 <- c("TAS", "QLD", "SA", "NSW")
union(set_1, set_2)
## [1] "VIC" "NSW" "WA" "TAS" "QLD" "SA"
intersect(set_1, set_2)
## [1] "NSW" "TAS"
setdiff(set_1, set_2)
## [1] "VIC" "WA"
The stringr
package was developed by Hadley Wickham to provide consistent and simple wrappers for common string operations.
These functions are closely related to their base R equivalents:
Concatenate with str_c()
( ∼ paste()
and paste0()
).
Number of characters with str_length()
( ∼ nchar()
).
Substring with str_sub()
( ∼ substr()
).
stringr
also provides str_dup()
for duplicating strings (the closest base R equivalent is strrep()).
str_dup("apples", times = 4)
## [1] "applesapplesapplesapples"
str_dup("apples", times = 1:4)
## [1] "apples" "applesapples" ## [3] "applesapplesapples" "applesapplesapplesapples"
In string processing, a common task is parsing text into individual words.
Often, this results in words having blank spaces (white spaces) on either end of the word. The str_trim()
function can be used to remove these spaces.
text <- c("Text ", " with", " whitespace ", " on", "both ", " sides ")
text
## [1] "Text " " with" " whitespace " " on" "both " ## [6] " sides "
str_trim(text, side = "left")
## [1] "Text " "with" "whitespace " "on" "both " ## [6] "sides "
str_trim(text, side = "right")
## [1] "Text" " with" " whitespace" " on" "both" ## [6] " sides"
str_trim(text, side = "both")
## [1] "Text" "with" "whitespace" "on" "both" ## [6] "sides"
str_pad()
. str_pad("apples", width = 10, side = "left")
## [1] " apples"
str_pad("apples", width = 10, side = "both")
## [1] " apples "
str_pad()
to pad a string with specified characters. The width
argument gives the width of the padded string and the pad
argument specifies the padding character.
str_pad("apples", width = 10, side = "right", pad = "!")
## [1] "apples!!!!"
The vast majority of string manipulations require pattern matching for a given text.
The good news is that the stringr
package has pattern matching functions to detect, subset, locate, count, extract, and replace strings.
str_detect()
detects the presence or absence of a pattern and returns a logical vector.
# detects pattern "ea"
x <- c("apple", "banana", "pear")
str_detect(x, pattern ="ea")
## [1] FALSE FALSE TRUE
# same as above
str_detect(x, "ea")
## [1] FALSE FALSE TRUE
While matching patterns, you can also use regular expressions.
Regular expressions (a.k.a. regexes) are a language that allows you to describe patterns in strings.
# Same as above using regex
x <- c("apple", "banana", "pear")
str_detect(x, regex("ea"))
## [1] FALSE FALSE TRUE
ignore_case = TRUE
.
bananas <- c("banana", "Banana", "BANANA")
# case insensitive match
str_detect(bananas, regex("banana",ignore_case = TRUE))
## [1] TRUE TRUE TRUE
[ ]
. For example:[abc]
: matches a, b, or c.[a-z]
: matches every character between a and z (in Unicode code point order).[^abc]
: matches anything except a, b, or c.[\^\-]
: matches ^ or -.
They take a little while to get your head around, but once you understand them, you’ll find them extremely useful.
For more information on regex capabilities, please refer to the regular expressions vignette of the stringr package.
[ ]
:[:punct:]
: punctuation.[:alpha:]
: letters.[:lower:]
: lowercase letters.[:upper:]
: uppercase letters.[:digit:]
: digits.[:xdigit:]
: hex digits.[:alnum:]
: letters and numbers.[:cntrl:]
: control characters.[:graph:]
: letters, numbers, and punctuation.[:print:]
: letters, numbers, punctuation, and white space.[:space:]
: space characters (basically equivalent to \s).[:blank:]
: space and tab.
library(stringr)
head(words)
## [1] "a" "able" "about" "absolute" "accept" "account"
length(words)
## [1] 980
Task 1. Find out how many words contain the pattern "ing".
Task 2. Find out how many words end in "ing". Hint: [Use anchors](https://stringr.tidyverse.org/articles/regular-expressions.html#anchors) for this.
Task 3. Find out which words end with "ing".
# Task 1:
str_detect(words, pattern = regex("ing")) %>% sum()
## [1] 10
# Same as above:
str_detect(words, "ing") %>% sum()
## [1] 10
# Task 2:
str_detect(words, "ing$") %>% sum()
## [1] 9
# Task 3:
words[str_detect(words, "ing$")]
## [1] "bring" "during" "evening" "king" "meaning" "morning" "ring" ## [8] "sing" "thing"
str_subset()
returns the elements of a character vector that match a regular expression.
Using the starwars
data set (from the dplyr package), let's subset the character names that contain any punctuation.
head(starwars$name)
## [1] "Luke Skywalker" "C-3PO" "R2-D2" "Darth Vader" ## [5] "Leia Organa" "Owen Lars"
str_subset(starwars$name, "[:punct:]")
## [1] "C-3PO" "R2-D2" "R5-D4" "Obi-Wan Kenobi"## [5] "IG-88" "Qui-Gon Jinn" "Ki-Adi-Mundi" "R4-P17"
str_extract()
extracts the text corresponding to the first match, returning a character vector.
str_extract(starwars$name, "[:punct:]")
## [1] NA "-" "-" NA NA NA NA "-" NA "-" NA NA NA NA NA NA NA NA NA ## [20] NA NA "-" NA NA NA NA NA NA NA NA "-" NA NA NA NA NA NA NA ## [39] NA NA NA NA NA NA NA NA NA NA NA NA "-" NA NA NA NA NA NA ## [58] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA "-" NA NA ## [77] NA NA NA NA NA NA NA NA NA NA NA
str_locate()
locates the first position of a pattern and returns a numeric matrix with columns start and end, whereas str_locate_all()
locates all positions of a given pattern.
str_locate(starwars$name, "[:punct:]") %>% head()
## start end## [1,] NA NA## [2,] 2 2## [3,] 3 3## [4,] NA NA## [5,] NA NA## [6,] NA NA
str_count()
counts the number of matches of a pattern in each string.
str_count(starwars$name, "[:punct:]")
## [1] 0 1 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0## [39] 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0## [77] 0 0 0 0 0 0 0 0 0 0 0
str_replace()
replaces the first match of a pattern in each string with another string (use str_replace_all() to replace every match).
The pattern
argument gives the string that is going to be replaced and the replacement
argument specifies the replacement string.
head(fruit)
## [1] "apple" "apricot" "avocado" "banana" "bell pepper"## [6] "bilberry"
# Replace berry with berries
str_replace(fruit, pattern = "berry", replacement = "berries")
## [1] "apple" "apricot" "avocado" ## [4] "banana" "bell pepper" "bilberries" ## [7] "blackberries" "blackcurrant" "blood orange" ## [10] "blueberries" "boysenberries" "breadfruit" ## [13] "canary melon" "cantaloupe" "cherimoya" ## [16] "cherry" "chili pepper" "clementine" ## [19] "cloudberries" "coconut" "cranberries" ## [22] "cucumber" "currant" "damson" ## [25] "date" "dragonfruit" "durian" ## [28] "eggplant" "elderberries" "feijoa" ## [31] "fig" "goji berries" "gooseberries" ## [34] "grape" "grapefruit" "guava" ## [37] "honeydew" "huckleberries" "jackfruit" ## [40] "jambul" "jujube" "kiwi fruit" ## [43] "kumquat" "lemon" "lime" ## [46] "loquat" "lychee" "mandarine" ## [49] "mango" "mulberries" "nectarine" ## [52] "nut" "olive" "orange" ## [55] "pamelo" "papaya" "passionfruit" ## [58] "peach" "pear" "persimmon" ## [61] "physalis" "pineapple" "plum" ## [64] "pomegranate" "pomelo" "purple mangosteen"## [67] "quince" "raisin" "rambutan" ## [70] "raspberries" "redcurrant" "rock melon" ## [73] "salal berries" "satsuma" "star fruit" ## [76] "strawberries" "tamarillo" "tangerine" ## [79] "ugli fruit" "watermelon"
# replace first l with "" (delete first l)
str_replace("Hello world", pattern = "l", replacement = "")
## [1] "Helo world"
# replace all l's with "" (delete l's)
str_replace_all("Hello world", pattern = "l", replacement = "")
## [1] "Heo word"
String manipulations using base R and stringr
.
Usage of regular expressions.
Pattern matching functions.
Practice!
quote <- "The most valuable thing you can have as a leader is clear data"
author <- "Ruth Porat"
paste()
function under Base R is used for creating and building strings. str_c()
is equivalent to the paste()
function. paste(quote, "by", author)
## [1] "The most valuable thing you can have as a leader is clear data by Ruth Porat"
paste0()
to paste without spaces between characters.
paste0("I", "love", "Data", "Preprocessing")
## [1] "IloveDataPreprocessing"
Keyboard shortcuts
↑, ←, Pg Up, k | Go to previous slide |
↓, →, Pg Dn, Space, j | Go to next slide |
Home | Go to first slide |
End | Go to last slide |
Number + Return | Go to specific slide |
b / m / f | Toggle blackout / mirrored / fullscreen mode |
c | Clone slideshow |
p | Toggle presenter mode |
t | Restart the presentation timer |
?, h | Toggle this help |
Esc | Back to slideshow |