足球队排名与市值数据爬取于可视化

足球队排名与市值数据爬取于可视化

本文是A World Cup 2018 primer, with graphs!的学习笔记,讲述了如何爬取世界杯足球队的FIFA排名和市值数据。不同于作者使用css选择器进行表格定位,我更喜欢使用xpath对表格进行定位。因此特别需要强调的两点是:

1. 本人对足球和世界杯一窍不通且没什么兴趣;
2. 本文的重点在于如何绘制图表。

首先爬取数据:

R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
library(tidyverse)
library(rvest)
library(awtools)
world.rank <- read_html("https://www.fifa.com/fifa-world-ranking/ranking-table/men/") %>%
html_nodes('table') %>%
html_table() %>%
data.frame(.[1]) %>%
select(c(1, 2, 3, 14))

names(world.rank) <- c("Rank", "Team",
"Total.Points",
"Confederations")
world.rank$Team <- gsub(pattern = '\r\n [A-Z][A-Z][A-Z]', replacement = '', world.rank$Team)
world.rank <- world.rank %>%
mutate(Team = case_when(
Team == 'IR Iran' ~ 'Iran',
Team == 'Korea Republic' ~ 'South Korea',
T ~ Team
))
world.rank$Confederations <- gsub(pattern = '#', replacement = '', world.rank$Confederations)

squads1 <- read_html('https://www.transfermarkt.com/world-cup-2018/teilnehmer/pokalwettbewerb/WM18') %>%
html_node(xpath = '//*[@id="yw1"]/table') %>%
html_table(fill = T) %>%
data.frame(.[1]) %>%
select(c(2, 3, 4, 6, 7)) %>%
transmute(
Team = Club,
Age = as.numeric(gsub(',', '.', Squad)),
Percent.Abroad = unlist(lapply(
strsplit(gsub(',', '.', WC.particip.), ' '), '[', 1
)),
Market.Value = parse_number(gsub(',', '.', Abroad))
) %>%
mutate(
Market.Value = as.numeric(
ifelse(Team %in% c('France', 'Spain'), Market.Value*1000000000, Market.Value*1000000)),
Percent.Abroad = as.numeric(Percent.Abroad)
)

squads2 <- read_html('https://www.transfermarkt.com/world-cup-2018/teilnehmer/pokalwettbewerb/WM18') %>%
html_node(xpath = '//*[@id="yw2"]/table') %>%
html_table(fill = T) %>%
data.frame(.[1]) %>%
select(c(2, 3, 4, 6, 7)) %>%
transmute(
Team = Club,
Age = as.numeric(gsub(',', '.', Squad)),
Percent.Abroad = unlist(lapply(
strsplit(gsub(',', '.', WC.particip.), ' '), '[', 1
)),
Market.Value = parse_number(gsub(',', '.', Abroad))
) %>%
mutate(
Market.Value = as.numeric(
ifelse(Team %in% c('France', 'Spain'), Market.Value*1000000000, Market.Value*1000000)),
Percent.Abroad = as.numeric(Percent.Abroad)
)

squads <- rbind(squads1, squads2)
squad.rank <- inner_join(squads, world.rank)

然后绘制各足球队的排名和市值的关系图:

R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
ggplot(squad.rank,
aes(reorder(Team, -Rank),
Market.Value,
color = Confederations)) +
geom_point() +
geom_linerange(aes(ymin = 0,
ymax = Market.Value)) +
scale_y_continuous(labels = scales::comma) +
geom_text(aes(
label = m.compress(Market.Value)
),
check_overlap = T,
family = 'STSong',
colour = '#444444',
hjust = -0.25, size = 5
) +
coord_flip() +
a_primary_color('邦联') +
labs(title = '踢更好的球,拿更少的钱',
subtitle = '世界杯足球队FIFA排名和市值',
x = '足球队(按照FIFA排名)',
y = '市值(€)',
caption = 'Market Value and Team data from Transfer Markt\nWorld Ranking and Confederation from FIFA') +
theme_bw(base_family = 'STSong',
base_size = 16) +
theme(panel.grid = element_blank(),
panel.border = element_blank(),
axis.ticks = element_blank(),
plot.subtitle = element_text(size = 12),
plot.margin = grid::unit(c(1, 1, 1, 1), "cm"))

绘制各个球队国外球员的比例:

R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
ggplot(squad.rank,
aes(reorder(Team, -Rank),
Percent.Abroad,
fill = Confederations)) +
geom_bar(stat = 'identity') +
coord_flip() +
geom_text(
aes(label = Percent.Abroad),
check_overlap = T,
family = 'STSong',
colour = '#444444',
hjust = -0.25,
size = 5) +
a_primary_fill() +
labs(title = '外国球员的比例',
subtitle = 'A look at World Cup teams by FIFA rank and percent of players who play abroad.',
x = 'Team (by FIFA rank)',
y = 'Percent Abroad',
caption = 'Team data from Transfer Markt\nWorld Ranking and Confederation from FIFA') +
theme_bw(base_family = 'STSong',
base_size = 16) +
theme(panel.grid = element_blank(),
panel.border = element_blank(),
axis.ticks = element_blank(),
plot.subtitle = element_text(size = 12),
plot.margin = grid::unit(c(1, 1, 1, 1), "cm"))

再看看看各个邦联的一些统计数据:

R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
confederation <- squad.rank %>% 
group_by(Confederations) %>%
summarise(
mean.age = mean(Age),
mean.abroad = mean(Percent.Abroad),
mean.value = mean(Market.Value),
mean.rank = median(Rank)
) %>%
ungroup() %>%
gather(type, value, 2:5)

ggplot(confederation,
aes(x = Confederations,
y = value, fill = type)) +
geom_bar(stat = 'identity',
show.legend = F) +
coord_flip() +
facet_wrap(~type, scales = 'free', ncol = 2) +
scale_y_continuous(labels = m.compress) +
hrbrthemes::theme_ipsum(base_family = 'STSongti-SC-Bold') +
a_secondary_fill()

# R

评论

Your browser is out-of-date!

Update your browser to view this website correctly. Update my browser now

×