宅男女神网 是我第一次爬的图片网站,用 Stata 爬的,后来就没有再爬过了,上面的图片比较乱,今天又上去看了一下,发现可以爬所有模特的数据。所以就爬下来玩了玩,模特挺多的,一共是 8320 个模特。
爬数据 怎么去爬这些模特数据呢,首先我找到了这个页面:找美女 :
我把年龄的选择条范围拉到了最大。最下面显示有 420 页,分析刚刚的操作的请求,发现是个 POST 请求,所以我的想法是先用 curl 把这 400 多个页面全部下载下来,然后再处理得到所有模特的主页地址,最后分别进入每个模特的主页爬取到她们的数据信息。
curl + Stata 是绝配:
Stata 1 2 3 forval i = 1/422{ !curl 'https: }
注意这里我把年龄设置为 0-100,这样会下载到 422 个 html 文件,然后接下来用 R 进行逐个提取:
R 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 library (rvest)library (progress)library (io)library (stringr)library (lubridate)library (formattable)home <- data.frame( title = NA , link = NA ) pb <- progress_bar$new(total = 422 ) for (i in 1 :422 ){ pb$tick(0 ) pb$tick() try ( html <- read_html(paste0('temp' , i, '.html' ), encoding = 'UTF-8' ) ) for (j in 1 :20 ){ try ( home <- rbind( home, data_frame( title = html %>% html_node(xpath = paste0('/html/body/ul/li[' , j, ']/div/div/a' )) %>% html_text(), link = html %>% html_node(xpath = paste0('/html/body/ul/li[' , j, ']/div/div/a' )) %>% html_attr('href' ) )) ) } } home <- home[!duplicated(home$title),] home <- subset(home, !is.na(home$title)) home$link <- paste0('https://www.nvshens.com' , home$link) qwrite(home, 'nvshenslink.rds' )
这样的得到的 home 数据框里有两个变量,一个属模特的名字,另一个是模特的链接,如果你不想运行上面的循环,可以直接下载我的结果:nvshenslink.rds
下面进入每个模特的主页爬取数据:R 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 rm(list = ls()) home <- qread('nvshenslink.rds' ) df <- data.frame( name = NA , age = NA , birthday = NA , constellation = NA , height = NA , weight = NA , vital = NA , birthplace = NA , job = NA , interest = NA , info = NA , link = NA , score = NA , scorepeople = NA , img = NA ) pb <- progress_bar$new(total = length(home$link)) for (i in 1 :length(home$link)){ pb$tick(0 ) pb$tick() try (html <- read_html(home$link[i])) table <- html %>% html_node(css = '#post > div:nth-child(2) > div > div.infodiv > table' ) %>% html_table() table$X1 <- gsub(pattern = '[: ]' , replacement = '' , table$X1) row.names(table) <- table$X1 try ( df <- rbind( df, data.frame( name = html %>% html_node(xpath = '//*[@id="post"]/div[2]/div/div[1]/h1' ) %>% html_text(), age = table[c('年龄' ), c('X2' )], birthday = table[c('生日' ), c('X2' )], constellation = table[c('星座' ), c('X2' )], height = table[c('身高' ), c('X2' )], weight = table[c('体重' ), c('X2' )], vital = table[c('三围' ), c('X2' )], birthplace = table[c('出生' ), c('X2' )], job = table[c('职业' ), c('X2' )], interest = table[c('兴趣' ), c('X2' )], info = html %>% html_node(xpath = '//*[@id="post"]/div[5]/div/div[1]/div[2]/p' ) %>% html_text(), link = home$link[i], score = html %>% html_node(xpath = '//*[@id="span_score"]' ) %>% html_text(), scorepeople = html %>% html_node(xpath = '//*[@id="lbl_score"]' ) %>% html_text(), img = html %>% html_node(xpath = '//*[@id="post"]/div[2]/div/div[3]/a/img' ) %>% html_attr('src' ) ) ) ) } df <- subset(df, !is.na(df$name)) df <- df[!duplicated(df$link),] qwrite(df, 'model.rds' )
同样上面的循环非常耗时易出错,如果你不想运行,可以下载我爬好的:model.rds
下面就可以开始分析了:
整理数据 首先整理整理数据:
R 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 rm(list = ls()) df <- qread('model.rds' ) df <- df %>% mutate( zodiac = gsub(pattern = '[0-9\\(\\) 属]' , replacement = '' , age), age = str_extract(age, pattern = '(.*) \\(' ) %>% gsub(x = ., pattern = ' \\(' , replacement = '' ) %>% as.numeric(), birthday = ymd(birthday), height = as.numeric(height), weight = gsub(' KG' , '' , weight) %>% as.numeric(), birthcountry = gsub(pattern = ' .*' , replacement = '' , birthplace), birthcity = gsub(pattern = '.* ' , replacement = '' , birthplace), bust = gsub(' W.*' , '' , vital) %>% gsub(x = ., pattern = '\\(.*\\)' , replacement = '' ) %>% gsub(x = ., pattern = 'B' , replacement = '' ) %>% as.numeric(), waist = gsub(x = vital, pattern = ' H.*' , replacement = '' ) %>% gsub(x = ., pattern = '.*W' , replacement = '' ) %>% gsub(x = ., pattern = '\\(.*\\)' , replacement = '' ) %>% gsub(x = ., pattern = 'B*' , replacement = '' ) %>% as.numeric(), hipline = gsub('.*H' , '' , vital) %>% gsub(x = ., pattern = '\\(.*\\)' , replacement = '' ) %>% gsub(x = ., pattern = 'B*' , replacement = '' ) %>% gsub(x = ., pattern = 'W*' , replacement = '' ) %>% gsub(x = ., pattern = '* W*' , replacement = '' ) %>% as.numeric(), score = score %>% as.numeric(), scorepeople = scorepeople %>% as.numeric() ) %>% arrange(desc(scorepeople), desc(score)) ``` ## 分析数据 ### 评价人数最多、评分最高的十个模特 首先是评价人数最多、评分最高的十个模特: ```r R knitr::kable(df %>% slice(1 :10 ) %>% select(name, age, score, link))
模特的年龄分布 R 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 df %>% group_by(age) %>% summarise( n = length(age)/8320 , bust = sum(bust, na.rm = T )/length(bust) ) %>% ggplot() + geom_col(aes(x = age, y = n, fill = bust)) + scale_fill_gradient('胸围' , low = '#66c2a5' , high = '#fc8d62' , breaks = c(0 , 20 , 40 , 60 , 80 ), labels = suffix(c(0 , 20 , 40 , 60 , 80 ), suffix = 'cm' )) + labs(x = '年龄' , y = '比例' , title = '宅男女神网上各个年龄的模特的比例' , subtitle = '一共8320个模特' , caption = '数据来源:宅男女神网\nhttps://www.nvshens.com' ) + scale_y_percent() + scale_x_continuous( breaks = c(20 , 30 , 40 , 50 ), labels = suffix(c(20 , 30 , 40 , 50 ), suffix = '岁' ))
胸围最大的十个模特 R 1 2 3 4 knitr::kable(subset(df, df$bust > 50 ) %>% arrange(desc(bust)) %>% select(name, age, bust, vital, link) %>% slice(1 :10 ))
胸围最小的十个模特 (胸围小于 50 的数据标准可能不一样,所以舍弃):
R 1 2 3 4 knitr::kable(subset(df, df$bust > 50 ) %>% arrange(bust) %>% select(name, age, bust, vital, link) %>% slice(1 :10 ))
胸围的分布 R 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 subset(df, df$bust > 50 ) %>% group_by(bust) %>% summarise( n = length(bust), age = sum(age, na.rm = T )/length(age) ) %>% ggplot() + geom_col(aes(x = bust, y = n, fill = age)) + labs(x = '胸围' , y = '数量' , title = '宅男女神网模特胸围分布' , subtitle = '一共8320个模特' , caption = '数据来源:宅男女神网\nhttps://www.nvshens.com' ) + scale_x_continuous( breaks = c(60 , 80 , 100 , 120 ), labels = suffix(c(60 , 80 , 100 , 120 ), "cm" ) ) + scale_fill_gradient('年龄' , low = '#66c2a5' , high = '#fc8d62' )
国籍与胸围 R 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 subset(df, df$bust > 50 ) %>% mutate(birthcountry = case_when( grepl("zg" , birthcountry) ~ "中国" , grepl("中国" , birthcountry) ~ "中国" , grepl("日本" , birthcountry) ~ "日本" , grepl("韩国" , birthcountry) ~ "韩国" , grepl("美国" , birthcountry) ~ "美国" , is.na(birthcountry) ~ "其它" , T ~ "其它" )) %>% group_by(bust, birthcountry) %>% summarise( n = length(bust) ) %>% ggplot() + geom_col(aes(x = bust, y = n, fill = birthcountry)) + labs(x = '胸围' , y = '数量' , title = '宅男女神网模特胸围与国家分布' , subtitle = '一共8320个模特' , caption = '数据来源:宅男女神网\nhttps://www.nvshens.com' ) + scale_x_continuous( breaks = c(60 , 80 , 100 , 120 ), labels = suffix(c(60 , 80 , 100 , 120 ), "cm" ) ) + scale_fill_brewer( "国家" , palette = 'Set1' , breaks = c("中国" , "日本" , "韩国" , "美国" , "其它" ))
各个国家模特的平均胸围 先统计各国模特的数量:
R 1 2 3 4 5 6 7 country <- df %>% group_by(birthcountry) %>% summarise(n = length(birthcountry)) %>% arrange(desc(n)) %>% subset(!is.na(birthcountry)) knitr::kable(country %>% slice(1 :10 ))
birthcountry
n
中国
3869
日本
3054
韩国
360
美国
272
马来西亚
122
泰国
102
越南
41
澳大利亚
39
加拿大
34
俄罗斯
32
R 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 subset(df, df$bust > 50 ) %>% mutate(birthcountry = case_when( grepl("zg" , birthcountry) ~ "中国" , grepl("中国" , birthcountry) ~ "中国" , grepl("日本" , birthcountry) ~ "日本" , grepl("韩国" , birthcountry) ~ "韩国" , grepl("美国" , birthcountry) ~ "美国" , grepl("越南" , birthcountry) ~ "越南" , grepl("马来西亚" , birthcountry) ~ "马来西亚" , grepl("泰国" , birthcountry) ~ "泰国" , grepl("澳大利亚" , birthcountry) ~ "澳大利亚" , grepl("加拿大" , birthcountry) ~ "加拿大" , grepl("俄罗斯" , birthcountry) ~ "俄罗斯" , is.na(birthcountry) ~ "其它" , T ~ "其它" )) %>% group_by(birthcountry) %>% summarise( n = mean(bust, na.rm = T ) ) %>% ggplot() + geom_col(aes(x = factor(birthcountry, levels = c(country$birthcountry[1 :10 ], "其它" )), y = n, fill = factor(birthcountry, levels = c(country$birthcountry[1 :10 ], "其它" )))) + geom_text(aes(x = factor(birthcountry, levels = c(country$birthcountry[1 :11 ], "其它" )), y = n + 5 , label = paste0(round(n), "cm" ))) + labs(x = '国家' , y = '平均胸围' , title = '宅男女神网各个国家模特的平均胸围' , subtitle = '一共8320个模特' , caption = '数据来源:宅男女神网\nhttps://www.nvshens.com' ) + scale_fill_brewer(palette = 'Set3' ) + guides(fill = 'none' )
可以看到各国国家的模特的胸围不分上下。
模特们的职业 R 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 library (tidytext)job <- df %>% unnest_tokens(job, job, token = stringr::str_split, pattern = "、" ) %>% group_by(job) %>% summarise(n = length(job)) %>% arrange(desc(n)) job %>% slice(1 :10 )%>% ggplot(aes(x = factor(job), y = n, fill = factor(job))) + geom_col() + scale_fill_brewer(palette = 'Set3' ) + guides(fill = "none" ) + labs(x = '职业' , y = '数量' , title = '模特们从事的前十大职业' , caption = '数据来源:宅男女神网\nhttps://www.nvshens.com' )
R 1 2 library (rWordCloud)d3Cloud(text = job$job, size = job$n)
模特们的星座分布 R 1 2 3 4 5 6 7 8 9 10 11 12 df %>% group_by(constellation, age) %>% summarise( n = length(constellation) ) %>% ggplot(aes(x = factor(constellation), y = n, fill = age)) + geom_col() + scale_fill_gradient('年龄' , low = '#66c2a5' , high = '#fc8d62' ) + theme(legend.position = 'top' ) + labs(x = '星座' , y = '数量' , title = '模特们的星座分布' , subtitle = '一共8320个模特' , caption = '数据来源:宅男女神网\nhttps://www.nvshens.com' )
模特们的属相分布 R 1 2 3 4 5 6 7 8 9 10 11 df %>% group_by(zodiac, age) %>% summarise( n = length(zodiac) ) %>% ggplot(aes(x = factor(zodiac), y = n, fill = age)) + geom_col() + scale_fill_gradient('年龄' , low = '#66c2a5' , high = '#fc8d62' ) + labs(x = '属相' , y = '数量' , title = '模特们的属相分布' , subtitle = '一共8320个模特' , caption = '数据来源:宅男女神网\nhttps://www.nvshens.com' )
卧槽!模特们还真的是属鸡的多!
模特们的得分分布 R 1 2 3 4 5 6 7 8 9 10 11 df %>% group_by(score, age) %>% summarise( n = length(score) ) %>% ggplot(aes(x = score, y = n, fill = age)) + geom_col() + scale_fill_gradient('年龄' , low = '#66c2a5' , high = '#fc8d62' ) + labs(x = '属相' , y = '分数' , title = '模特们的得分分布' , subtitle = '一共8320个模特' , caption = '数据来源:宅男女神网\nhttps://www.nvshens.com' )