R for Data Science(二)

R for Data Science(二)

本文涵盖了R for Data Science一书第13章(Relational data)到第21章(Iteration)的内容。

关系型数据

R
1
2
3
4
5
6
7
8
9
10
11
12
13
library(tidyverse)
x <- tribble(
~key, ~val_x,
1, "x1",
2, "x2",
3, "x3"
)
y <- tribble(
~key, ~val_y,
1, "y1",
2, "y2",
4, "y3"
)

内连接:只保留共同部分

R
1
2
3
4
5
6
7
8
9
x %>%
inner_join(y)

Joining, by = "key"
# A tibble: 2 x 3
key val_x val_y
<dbl> <chr> <chr>
1 1 x1 y1
2 2 x2 y2

外连接:分为三种类型

  • 左连接:以左表为主表;
  • 右连接:以右表为主表;
  • 全连接:连接的结果会保留两个表的所有观测值。

R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
> x %>%
+ left_join(y)
Joining, by = "key"
# A tibble: 3 x 3
key val_x val_y
<dbl> <chr> <chr>
1 1 x1 y1
2 2 x2 y2
3 3 x3 NA

> x %>%
+ right_join(y)
Joining, by = "key"
# A tibble: 3 x 3
key val_x val_y
<dbl> <chr> <chr>
1 1 x1 y1
2 2 x2 y2
3 4 NA y3

> x %>%
+ full_join(y)
Joining, by = "key"
# A tibble: 4 x 3
key val_x val_y
<dbl> <chr> <chr>
1 1 x1 y1
2 2 x2 y2
3 3 x3 NA
4 4 NA y3

重复键值

  1. 情形一:只有一个表中有重复键值
R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
> x <- tribble(
+ ~key, ~val_x,
+ 1, "x1",
+ 2, "x2",
+ 2, "x3",
+ 1, "x4"
+ )

> y <- tribble(
+ ~key, ~val_y,
+ 1, "y1",
+ 2, "y2"
+ )

> x %>%
+ left_join(y, by = "key")
# A tibble: 4 x 3
key val_x val_y
<dbl> <chr> <chr>
1 1 x1 y1
2 2 x2 y2
3 2 x3 y2
4 1 x4 y1
  1. 情形二:两个表中都有重复键值
R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
> x <- tribble(
+ ~key, ~val_x,
+ 1, "x1",
+ 2, "x2",
+ 2, "x3",
+ 3, "x4"
+ )

> y <- tribble(
+ ~key, ~val_y,
+ 1, "y1",
+ 2, "y2",
+ 2, "y3",
+ 3, "y4"
+ )

> x %>% left_join(y)
Joining, by = "key"
# A tibble: 6 x 3
key val_x val_y
<dbl> <chr> <chr>
1 1 x1 y1
2 2 x2 y2
3 2 x2 y3
4 2 x3 y2
5 2 x3 y3
6 3 x4 y4

定义关键列

如果主表和副表中的key变量的名称不相同,可以使用如下的语句合并:

R
1
2
3
library(nycflights13)
flights %>%
left_join(airports, by = c("dest" = "faa"))

merge函数也能实现各种连接:

R
1
2
3
4
merge(x, y) # 内连接
merge(x, y, all.x = T) # 左连接
merge(x, y, all.y = T) # 右连接
merge(x, y, all.x = T, all.y = T) # 全连接
dplyr merge
inner_join(x, y) merge(x, y)
left_join(x, y) merge(x, y, all.x = TRUE)
right_join(x, y) merge(x, y, all.y = TRUE),
full_join(x, y) merge(x, y, all.x = TRUE, all.y = TRUE)

过滤连接

  • semi_join(x, y) 结果只含有完全匹配的观测值。

semi_join

  • anti_join(x, y) 结果只含有在y中匹配不到的观测值。

anti_join

R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
> x <- tribble(
+ ~key, ~val_x,
+ 1, "x1",
+ 2, "x2",
+ 3, "x3"
+ )

> y <- tribble(
+ ~key, ~val_y,
+ 1, "y1",
+ 2, "y2",
+ 4, "y3"
+ )

> x %>%
+ semi_join(y)
Joining, by = "key"
# A tibble: 2 x 2
key val_x
<dbl> <chr>
1 1 x1
2 2 x2

> x %>%
+ anti_join(y)
Joining, by = "key"
# A tibble: 1 x 2
key val_x
<dbl> <chr>
1 3 x3

设置操作

R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
df1 <- tribble(
~x, ~y,
1, 1,
2, 1
)
df2 <- tribble(
~x, ~y,
1, 1,
1, 2
)

intersect(df1, df2) # 返回在两个数据框中都有的观测值
union(df1, df2) # 返回两个数据框的并集
> dplyr::setdiff(df1, df2) # df1中逐个去除df2中的元素
# A tibble: 1 x 2
x y
<dbl> <dbl>
1 2 1

> dplyr::setdiff(df2, df1) # df2中逐个去除df1中的元素
# A tibble: 1 x 2
x y
<dbl> <dbl>
1 1 2

字符串

R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
library(stringr)

# 字符串长度
str_length(c("a", "R for data science", NA))

# 字符串合并
str_c("x", "y", "z")
str_c("x", "y", "z", sep = ", ")

x <- c("a", NA)
str_c("|-", x, "-|")
# 如果想把NA作为字符串处理,可以使用str_replace_na()函数
str_c("|-", str_replace_na(x), "-|")

name <- "程振兴"
time_of_day <- "早上好!"
birthday <- FALSE

str_c(
time_of_day, " ", name,
"!",
if (birthday) "另外祝你生日快乐!"
)

# str_c是向量化的函数
str_c("prefix-", c("x", "y", "z"), "-suffix")

# 合并向量
str_c(c("x", "y", "z"), collapse = "")

# 字符串替换
x <- c("Apple", "Banana", "Pear")
str_sub(x, 1, 3)
str_sub(x, -3, -1)
str_sub(x, 1, 1) <- str_to_lower(str_sub(x, 1, 1))
x
R
1
2
3
4
5
6
7
8
9
10
11
# 不同语言环境的单词排序
x <- c("apple", "eggplant", "banana")
# English
str_sort(x, locale = "en")
# Hawaiian
str_sort(x, locale = "haw")

# 去除字符串两端的空格
str_trim(" apple ", side = "left")
str_trim(" apple ", side = "right")
str_trim(" apple ", side = "both")

正则表达式

R
1
2
3
4
5
6
7
8
9
10
# 正则表达式
x <- c("apple", "banana", "pear")
str_view(x, "an")
str_view(x, ".a.")
# 转义字符
str_view(c("abc", "a.c", "bef"), "a\\.c")

(x <- "a\\b")
writeLines(x)
str_view(x, "\\\\")

R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# ^ 匹配字符串的开头。
# $ 匹配字符串的结尾。
x <- c("apple", "banana", "pear")
str_view(x, "^a")
str_view(x, "a$")

x <- c("apple pie", "apple", "apple cake")
str_view(x, "apple")
str_view(x, "^apple")
str_view(x, "apple$")

# 匹配以y开头的单词
(words <- stringr::words) %>% str_view("^y")
# 匹配以x结尾的单词
(words <- stringr::words) %>% str_view("x$")
# 匹配长度为3的单词
(words <- stringr::words) %>% str_view("^[a-z][a-z][a-z]$")
# 匹配有七个字母或更多。
(words <- stringr::words) %>% str_view(".[a-z][a-z][a-z][a-z][a-z][a-z][a-z].")

字符类和替代方案

R
1
2
3
4
5
6
7
8
9
# \d:匹配任何数字。
# \s:匹配任何空格(例如空格,制表符,换行符)。
# [abc]:匹配a,b或c。
# [^abc]:匹配除a,b或c之外的任何内容。

str_view(c("abc", "a.c", "a*c", "a c"), "a[.]c")
str_view(c("abc", "a.c", "a*c", "a c"), ".[*]c")
str_view(c("abc", "a.c", "a*c", "a c"), "a[ ]")
str_view(c("grey", "gray"), "gr(e|a)y")

重复

R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# ?:0或1
# +:1或更多
# *:0或更多
x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII"
str_view(x, "CC?")
str_view(x, "CC+")
str_view(x, "CC*")

# {n}:完全是n
# {n,}:n或更多
# {,m}:最多m
# {n,m}:在n和m之间
str_view(x, "C{2}")
str_view(x, "C{1,3}")
str_view(x, "C{2,}")

# 默认使贪婪模式,它们将匹配可能的最长字符串。
# 也可以通过使用?来匹配最短的字符
str_view(x, "C{1,3}")
str_view(x, "C{1,3}?")
str_view(x, 'C[LX]+?')

分组和反向引用

R
1
2
# 查找具有重复字母对的所有水果。
str_view(fruit, "(..)\\1", match = TRUE)

检查匹配

R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# 检查匹配
x <- c("apple", "banana", "pear")
# 是否匹配到e?
str_detect(x, "e")

# 以t开头的单词数
sum(str_detect(words, "^t"))
# 以元音结尾的单词比例
mean(str_detect(words, "[aeiou]$"))
# 含有元音的单词的比例
mean(str_detect(words, "[aeiou]"))

# 是否不含元音?
no_vowels_1 <- !str_detect(words, "[aeiou]")
no_vowels_2 <- str_detect(words, "^[^aeiou]+$")
# 上面两种匹配方式是等效的
identical(no_vowels_1, no_vowels_2)

# 以x结尾的单词
words[str_detect(words, "x$")]
# 等价于
str_subset(words, "x$")

# 过滤出数据框中以x结尾的观测值
df <- tibble(
word = words,
i = seq_along(word)
)
df
df %>%
filter(str_detect(word, "x$"))

# 计算匹配数量
x <- c("apple", "banana", "pear")
str_count(x, "a")

# 统计单词的元音个数、辅音个数以及单词的长度
df %>%
mutate(
vowels = str_count(word, "[aeiou]"),
consonants = str_count(word, "[^aeiou]"),
length = str_length(word)
)

# 字符串"abababa"中"aba"出现了多少次?
str_count("abababa", "aba") # 2
str_view_all("abababa", "aba")

# 提取含颜色的句子
colours <- c("red", "orange", "yellow", "green", "blue", "purple")
colour_match <- str_c(colours, collapse = "|")
colour_match
# 注意str_extract()仅提取第一个匹配项。
str_subset(sentences, colour_match) %>% str_extract(colour_match)
# str_extract_all()提取所有匹配项。
str_subset(sentences, colour_match) %>% str_extract_all(colour_match)
# 如果使用simplify = TRUE,str_extract_all()将返回一个矩阵,其中短匹配扩展到与最长匹配的长度:
str_subset(sentences, colour_match) %>% str_extract_all(colour_match, simplify = T) %>% as_tibble()

x <- c("a", "a b", "a b c")
str_extract_all(x, "[a-z]", simplify = TRUE) %>% as_tibble()

# 查找“a”或“the”之后的任何单词:
noun <- "(a|the) ([^ ]+)"

has_noun <- sentences %>%
str_subset(noun) %>%
head(10)
has_noun %>%
str_extract(noun)

has_noun %>%
str_match(noun) %>%
as_tibble()

# tidyr::extract()也可以用于字符串匹配提取,并且还可以重新命名
tibble(sentences = sentences) %>%
tidyr::extract(
sentences, c("article", "noun"),
"(a|the) ([^ ]+)",
remove = F
)

# 匹配所有的匹配项。
has_noun %>%
str_match_all(noun)

替换、分割

R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# 替换
x <- c("apple", "pear", "banana")
str_replace(x, "[aeiou]", "-")
str_replace_all(x, "[aeiou]", "-")

x <- c("1 house", "2 cars", "3 people")
str_replace_all(x, c("1" = "one",
"2" = "two",
"3" = "three"))

# 反转第2、3个单词的顺序
sentences %>%
str_replace("([^ ]+) ([^ ]+) ([^ ]+)", "\\1 \\3 \\2") %>%
head(5)

# 分割
sentences %>%
head(1) %>%
str_split(" ")

"a|b|c|d" %>%
str_split("\\|") %>%
.[[1]]

# 与返回列表的其他字符串函数一样,您可以使用simplify = TRUE返回矩阵
sentences %>%
head(5) %>%
str_split(" ", simplify = T)
fields <- c("姓名:程振兴", "国家:中国", "年龄:35")
fields %>%
str_split(":", n = 2, simplify = T)

x <- "This is a sentence. This is another sentence."
str_view_all(x, boundary("word"))
x %>%
str_split(" ") %>%
.[[1]]

x %>%
str_split(boundary("word")) %>%
.[[1]]

# 定位到匹配成功的位置
str_locate(x, "is")
str_locate_all(x, "is")

其他类型

R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
str_view(fruit, "nana")
# 上面的语句是下面语句的简写
str_view(fruit, regex("nana"))
# ignore_case = TRUE允许字符匹配大写或小写形式。
bananas <- c("banana", "Banana", "BANANA")
str_view(bananas, "banana")
str_view(bananas, regex("banana",
ignore_case = T))

# multiline = TRUE允许^和$匹配每一行的开头和结尾,而不是整个字符串的开头和结尾。
x <- "Line 1\nLine 2\nLine 3"
writeLines(x)
str_extract_all(x, "^Line")[[1]]
str_extract_all(x, regex("^Line", multiline = T))[[1]]

# comments = TRUE允许您使用注释和空格来使复杂的正则表达式更容易理解。

phone <- regex("
\\(? # 可选的小括号
(\\d{3}) # 三位的数字
[) -]? # 小括号、空格或者短线
(\\d{3}) # 三位的数字
[ -]? # 可选的空格或短线
(\\d{3}) # 三个数字
", comment = T)
str_match("514-791-8141", phone)
# dotall = TRUE允许.匹配所有内容,包括\n。

# 你还可以使用其它三个函数代替regex():
# fixed():完全匹配指定的字节序列
microbenchmark::microbenchmark(
fixed = str_detect(sentences, fixed("the")),
regex = str_detect(sentences, "the"),
times = 30
)

# 注意使用fixed()匹配非英语数据可能会出问题,因为非英文字符通过有多种方式表示。
a1 <- "\u00e1"
a2 <- "a\u0301"
c(a1, a2)
identical(a1, a2) # FALSE

# coll():对于不区分大小写的匹配非常有用
i <- c("I", "İ", "i", "ı")
i
str_subset(i, coll("i", ignore_case = T))
# 对于其它语言环境
str_subset(i, coll("i", ignore_case = T,
locale = "tr"))
# 查看默认的语言环境
stringi::stri_locale_info()

正则表达式的其它用法

R
1
2
3
4
5
# apropos()可以用于搜索全局环境中可用的所有对象。
apropos("replace")

# dir()函数可以用于列出目录中的所有文件
head(dir(pattern = "\\.do$"))

stringi包

R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
library(stringi)
# 计算单词的数量
stri_count_words(sentences) %>% head()

# 查找重复的字符串
stri_duplicated(c(
"the", "brown", "cow", "jumped", "over",
"the", "lazy", "fox"
))

# 生成随机文本
stri_rand_lipsum(nparagraphs = 1)
stri_rand_strings(n = 4, length = 5)

stri_rand_shuffle("我是程振兴")

# 字母排序
string1 <- c("hladny", "chladny", "bcade")
stri_sort(string1, locale = "en")
string2 <- c("number100", "number2")
stri_sort(string2, opts_collator = stri_opts_collator(numeric = T))

因子

默认情况下,ggplot2将删除没有任何值的级别。您可以强制它们显示:

R
1
2
3
ggplot(gss_cat, aes(race)) +
geom_bar() +
scale_x_discrete(drop = FALSE)

因子排序

R
1
2
3
4
5
6
7
8
9
10
11
12
13
relig_summary <- gss_cat %>%
group_by(relig) %>%
summarise(
age = mean(age, na.rm = TRUE),
tvhours = mean(tvhours, na.rm = TRUE),
n = n()
) %>%
mutate(
relig = fct_reorder(relig, tvhours)
)

# 将relig按照tvhours排序
ggplot(relig_summary, aes(tvhours, relig)) + geom_point()

fct_reorder2()通过y与最大值相关联的值重新排序因子x。这使得绘图更容易阅读,因为线条颜色与图例对齐:

R
1
2
3
4
5
6
7
8
9
10
11
gss_cat %>%
filter(!is.na(age)) %>%
count(age, marital) %>%
group_by(age) %>%
mutate(prop = n / sum(n)) %>%
ggplot(aes(
age, prop,
colour = fct_reorder2(
marital, age, prop))) +
geom_line() +
labs(colour = "marital")

最后,对于条形图,您可以使用fct_infreq()增加频率的顺序:

R
1
2
3
4
5
6
7
8
gss_cat %>%
mutate(
marital = marital %>%
fct_infreq() %>%
fct_rev()
) %>%
ggplot(aes(marital)) +
geom_bar()

修改因子水平

R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
gss_cat %>% count(partyid)
gss_cat %>%
mutate(
partyid = fct_recode(
partyid,
"无回答" = "No answer",
"不知道" = "Don't know",
"其它" = "Other party"
)
) %>%
count(partyid)
# 你还可以把不同组修改到同一组
gss_cat %>%
mutate(
partyid = fct_recode(
partyid,
"其它" = "No answer",
"其它" = "Don't know",
"其它" = "Other party"
)
) %>%
count(partyid)

# fct_collapse()对于组合多个级别很方便
gss_cat %>%
mutate(partyid = fct_collapse(
partyid,
"其它" = c("No answer", "Don't know", "Other party")
)) %>%
count(partyid)

# 如果你想把所有的小组混合在一起,使用fct_lump():
gss_cat %>%
mutate(relig = fct_lump(relig)) %>%
count(relig)

# 默认情况是逐步将最小的组组合在一起,确保聚合之后的组仍然是最小组,也可以指定保留的组
gss_cat %>%
mutate(relig = fct_lump(relig, n = 10)) %>%
count(relig, sort = T)

日期和时间

R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
library(lubridate)
today()
now()
ymd("2017-03-01")
mdy("Jan. 31st, 2017")
dmy("31-Jan-2017")
ymd(20170101)
ymd_hms("20170101201159")

# make_datetime()/make_date()
flights %>%
select(year, month, day, hour, minute) %>%
mutate(
departure = make_datetime(year, month, day, hour, minute),
date = make_date(year, month, day)
) %>%
select(departure, date)

获取组件

R
1
2
3
4
5
6
7
(datetime <- ymd_hms(now()))
datetime %>% year()
datetime %>% month()
datetime %>% day()
# 获取星期
datetime %>% wday(label = T, abbr = F) # 4表示周三
datetime %>% month(label = T, abbr = F)

日期的舍入

R
1
2
3
4
5
6
7
8
9
10
11
datetime %>%
floor_date("week") # 返回上一个周日的日期
datetime %>%
ceiling_date("week") # 返回下一个周日的日期

datetime %>%
round_date("week")
datetime %>%
round_date("month")
datetime %>%
round_date("year")

设置组件

R
1
2
3
4
5
datetime
hour(datetime) <- hour(datetime) + 1
datetime

update(datetime, year = 2020, month = 2)

时间跨度

R
1
2
3
4
5
> age <- today() - ymd(19960624)
> age
Time difference of 8248 days
> age %>% as.duration()
[1] "712627200s (~22.58 years)"

持续时间

R
1
2
3
4
5
6
7
8
9
dseconds(15)
days_in_month("2012-02-01" %>% ymd())
dweeks(3)
dyears(22)
ddays(0:5)
dhours(c(12, 20))
# 可以在天数之间添加和减去持续时间
(tomorrow <- today() + ddays(1))
(last_year <- today() - dyears(1))

时间段

R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
seconds(15)
minutes(10)
hours(c(12, 24))
years(1)
10 * (months(6) + days(1))

ymd("2016-01-01") + dyears(1)
ymd("2016-01-01") + years(1)

years(1)/days(1) # 365.25

next_year <- today() + years(1)
(today() %--% next_year) / ddays(1) # 365

(today() %--% next_year) %/% ddays(1)

管道操作符

R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
library(magrittr)
assign("x", 10)
("x" %>% assign(100))

tryCatch(stop("!"), error = function(e) "An error!")

# 点点
commas <- function(...){
stringr::str_c(..., collapse = ", ")
}
commas(letters[1:10])

# getOption("width"):获取控制台的宽度
rule <- function(..., pad = "-"){
title <- paste0(...)
width <- getOption("width") - nchar(title) - 5
print(width)
cat(title, " ", stringr::str_dup(pad, width), "\n", sep = "")
}

rule("重要的输出:")

# 编写可管理的函数
# invisible()函数可以隐藏输出
show_missings <- function(df){
n <- sum(is.na(df))
cat("缺失值的数量为:", n, "\n", sep = "")
invisible(df)
}
show_missings(flights)
x <- show_missings(flights)
dim(x) # 可以看出x是个数据框

# 定义操作符
`%czx%` <- function(x, y){
return(sum(x, y))
}
10 %czx% 100

向量

R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
set_names(1:3, c("a", "b", "c"))

# 属性
x <- 1:10
attr(x, "greeting")
attr(x, "greeting") <- "Hi!"
attr(x, "farewell") <- "Bye!"
attributes(x)

x <- ymd_hms(now())
x
unclass(x)
attr(x, "tzone") <- "US/Pacific"
x
y <- as.POSIXlt(x)
y
attributes(y)

(tb <- tibble(x = 1:5, y = 5:1))
typeof(tb)

df <- tibble(
a = rnorm(10),
b = rnorm(10),
c = rnorm(10),
d = rnorm(10)
)

output <- vector("double", ncol(df))
for(i in seq_along(df)){
output[[i]] <- median(df[[i]])
}
output

means <- c(0, 1, 2)
out <- vector("list", length(means))
for(i in seq_along(means)){
n <- sample(100, 1)
out[[i]] <- rnorm(n, means[[i]])
}
str(out)
# unlist()函数可以将矢量列表展平为单个矢量。
str(unlist(out))

map函数

R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# map() 映射产生列表。
# map_lgl() 映射产生一个逻辑向量。
# map_int() 映射产生一个整数向量。
# map_dbl() 映射产生一个double向量。
# map_chr() 映射产生一个字符矢量。
map_dbl(df, mean)
map_dbl(df, median)
z <- list(x = 1:3, y = 4:5)
map_int(z, length)

# 将mtcars数据集拆分为三个部分(每个圆柱体一个),并为每个部分拟合相同的线性模型:
models <- mtcars %>%
split(.$cyl) %>%
map(function(df) lm(mpg ~ wt, data = df))
# 上面的代码等价于:
models <- mtcars %>%
split(.$cyl) %>%
map(~lm(mpg ~ wt, data = .))

models %>%
map(summary) %>%
map_dbl(~.$r.squared) %>%
round(3)
# 下面的代码功能相同
models %>%
map(summary) %>%
map_dbl("r.squared") %>%
round(3)

# 还可以按照位置选择元素
x <- list(list(1, 2, 3), list(4, 5, 6), list(7, 8, 9))
x %>% map_dbl(2)

base R

R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# lapply()基本等同于,map()
# sapply()可以自动简化输出

x1 <- list(
c(0.27, 0.37, 0.57, 0.91, 0.20),
c(0.90, 0.94, 0.66, 0.63, 0.06),
c(0.21, 0.18, 0.69, 0.38, 0.77)
)
x2 <- list(
c(0.50, 0.72, 0.99, 0.38, 0.78),
c(0.93, 0.21, 0.65, 0.13, 0.27),
c(0.39, 0.01, 0.38, 0.87, 0.34)
)

threshold <- function(x, cutoff = 0.8){
x[x > cutoff]
}
x1 %>% sapply(threshold) %>% str()
x1 %>% lapply(threshold) %>% str()
x1 %>% map(threshold) %>% str()

x2 %>% sapply(threshold) %>% str()
x2 %>% lapply(threshold) %>% str()
x2 %>% map(threshold) %>% str()

# vapply()函数是一个安全的替代方法
x2 %>% vapply(threshold, FUN.VALUE = 0) %>% str()

映射多个参数

R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
mu <- list(5, 10, -3)
mu %>%
map(rnorm, n = 5) %>%
str()

sigma <- list(1, 5, 10)
seq_along(mu) %>%
map(~rnorm(5, mu[[.]], sigma[[.]])) %>%
str()
# 但是这样会有一种混淆的感觉,使用map2()更方便
map2(mu, sigma, rnorm, n = 5) %>%
str()

# 如果有多个参数要映射呢?
n <- list(1, 3, 5)
args <- list(n, mu, sigma)
args %>%
pmap(rnorm) %>%
str()

# 下面的方法更可靠
list(mean = mu, sd = sigma, n = n) %>%
pmap(rnorm) %>%
str()

params <- tribble(
~mean, ~sd, ~n,
5, 1, 1,
10, 5, 3,
-3, 10, 5
)

params %>%
pmap(rnorm) %>%
str()

# 调用不同的函数
f <- c("runif", "rnorm", "rpois")
param <- list(
list(min = -1, max = 1),
list(sd = 5),
list(lambda = 10)
)
invoke_map(f, param, n = 5) %>% str()

sim <- tribble(
~f, ~params,
"runif", list(min = -1, max = 1),
"rnorm", list(sd = 5),
"rpois", list(lambda = 10)
)
sim %>%
mutate(sim = invoke_map(f, params, n = 10))

walk()函数

R
1
2
3
4
5
6
7
8
9
10
11
# 当你想要调用函数的副作用而不是其返回值的时候,walk()函数是个不错的替代方法
x <- list(1, "a", 3)
x %>%
walk(print)
# 如果您有一个图表列表和一个文件名向量,您可以使用pwalk()将每个文件保存到磁盘上的相应位置:
plots <- mtcars %>%
split(.$cyl) %>%
map(~ggplot(., aes(mpg, wt)) + geom_point() + theme_bw())
paths <- stringr::str_c(names(plots), ".pdf")

pwalk(list(paths, plots), ggsave, path = getwd())

谓词函数

R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
iris %>%
keep(is.factor) %>%
str()

iris %>%
discard(is.factor) %>%
str()

x <- list(1:5, letters, list(10))
x %>% str()
x %>%
some(is_character)

x %>%
every(is_vector)

(x <- sample(10))
x %>%
detect(~ . >5)
x %>%
detect_index(~ . > 5)
x %>%
head_while(~ . > 5)
x %>%
tail_while(~ . > 5)

reduce()/accumulate()函数

R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# reduce()函数
# 多个tibble的整合
dfs <- list(
age = tibble(name = "John", age = 30),
sex = tibble(name = c("John", "Mary"), sex = c("M", "F")),
trt = tibble(name = "Mary", treatment = "A")
)
dfs %>% reduce(full_join)

# 找到向量的交集
vs <- list(
c(1, 3, 5, 6, 10),
c(1, 2, 3, 7, 8, 10),
c(1, 2, 3, 4, 8, 9, 10)
)
vs %>% reduce(intersect)

# accumulate()函数
x <- sample(10)
x %>% accumulate(`+`)
x %>% accumulate(`%czx%`)
# R

评论

程振兴

程振兴 @czxa.top
截止今天,我已经在本博客上写了607.9k个字了!

Your browser is out-of-date!

Update your browser to view this website correctly. Update my browser now

×