Scraping Images from the BT之家 Website

Friendly reminder: the content of this post is not suitable for minors.

I recently discovered this site, BT之家, where you can download lots of movies; beyond that, it also hosts a large amount of material that is not suitable for minors. I hadn't used Python in a while, so I scraped the images on the site.

The scraping approach is the usual one:

  1. Get the links to all of the photo albums;
  2. Get the image links inside each album;
  3. Download the images.

Although the site has no anti-scraping measures in place, I still mimicked a browser while scraping, just in case such measures get added later.
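
Before the full script, here is a minimal sketch of that three-step flow. The helper names and the simplified selector are mine, for illustration only; the real selectors and the full User-Agent string appear in the script below.

import requests
from bs4 import BeautifulSoup

# Browser-like headers so the requests look like they come from Chrome
# (trimmed here; the full string is in the script below).
headers = {"User-Agent": "Mozilla/5.0 (Macintosh) AppleWebKit/537.36 Chrome/66.0 Safari/537.36"}

def get_album_links(list_url):
    """Step 1: collect the album links from one listing page."""
    soup = BeautifulSoup(requests.get(list_url, headers=headers).text, "html.parser")
    return [a["href"] for a in soup.find_all("a", class_="subject_link")]

def get_image_links(album_url):
    """Step 2: collect the image URLs inside one album page."""
    soup = BeautifulSoup(requests.get(album_url, headers=headers).text, "html.parser")
    return [img["src"] for img in soup.find_all("img")]

def download(url, filename):
    """Step 3: save one image to disk."""
    res = requests.get(url, headers=headers)
    res.raise_for_status()
    with open(filename, "wb") as f:
        f.write(res.content)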

# Homepage: http://www.btbtt01.com/forum-index-fid-8-page-1.htm
import requests
import re
import pandas as pd
import os
from bs4 import BeautifulSoup

headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) "
                         "AppleWebKit/537.36 (KHTML, like Gecko) "
                         "Chrome/66.0.3359.181 Safari/537.36",
           "Accept": "image/webp,image/apng,image/*,*/*;q=0.8",
           "Referer": "http://m.mzitu.com/138401",
           "Accept-Encoding": "gzip, deflate",
           "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"}

# First, get the total number of listing pages.
def get_pagecount():
    url = "http://www.btbtt01.com/forum-index-fid-8-page-1.htm"
    req = requests.get(url, headers=headers)
    bsObj = BeautifulSoup(req.text, 'html.parser')
    # The last pager link reads something like "... 1017"; pull out the number.
    totalpage_temp = bsObj.findAll("div", {"class": "page"})[0].findAll("a")[10].get_text()
    return int(re.findall(r"\.\.\. (.*)", totalpage_temp)[0])

# Collect the album links from every listing page.
def alllink(pagecount):
    link = []
    for j in range(1, pagecount + 1):
        url = "http://www.btbtt01.com/forum-index-fid-8-page-" + str(j) + ".htm"
        req = requests.get(url, headers=headers)
        bsObj = BeautifulSoup(req.text, 'html.parser')
        # Threads are tagged with three different classes; collect all of them.
        linklist1 = bsObj.findAll("a", {"class": "subject_link thread-digest-1"})
        linklist2 = bsObj.findAll("a", {"class": "subject_link thread-new"})
        linklist3 = bsObj.findAll("a", {"class": "subject_link thread-old"})
        templink = []
        for a in linklist1 + linklist2 + linklist3:
            templink.append(a.attrs['href'])
        link += templink
        print("Page " + str(j) + " done!")
    print("%d albums in total" % len(link))
    # Save the collected links so the crawl can be re-run from bt.csv.
    test = pd.DataFrame(columns=['url'], data=link)
    test.to_csv("bt.csv")
    return link

# Collect every image link on one album page.
def get_pic_link(pageurl):
    pagereq = requests.get(pageurl, headers=headers)
    pagebsObj = BeautifulSoup(pagereq.text, 'html.parser')
    img = pagebsObj.findAll("div", {"class": "message"})[0].findAll("img")
    piclink = []
    for tag in img:
        piclink.append(tag.attrs['src'])
    return piclink

# Download a single image.
def download_pic(url, name):
    res = requests.get(url, headers=headers)
    res.raise_for_status()
    with open(name, 'wb') as playFile:
        for chunk in res.iter_content(100000):
            playFile.write(chunk)

# Get the album title (used as the folder name).
def get_title(pageurl):
    pagereq = requests.get(pageurl, headers=headers)
    pagebsObj = BeautifulSoup(pagereq.text, 'html.parser')
    return pagebsObj.findAll("title")[0].get_text().replace(" ", "")

# Create a subfolder with the given name and switch into it.
def make_and_cd(foldername):
    if not os.path.exists(foldername):
        os.makedirs(foldername)   # makedirs also creates any missing parent directories
        print("--- Creating new folder... ---")
        print("--- Done ---")
    else:
        print("--- Folder already exists! ---")
    os.chdir(foldername)

## Start the full-site crawl
os.chdir("/Users/mr.cheng/Desktop/mzitu爬取/")
make_and_cd("bt")
cwd = os.getcwd()
all_links = alllink(get_pagecount())
df = pd.read_csv("bt.csv")
all_links = list(df.url)
total_length = len(all_links)
print(total_length)
progress = 0
for i in all_links:
    progress += 1
    title = get_title(i)
    # Skip albums that already have a folder, which makes the crawl resumable.
    if not os.path.exists(title):
        print("Downloading:", title)
        make_and_cd(title)
        m = 1
        for j in get_pic_link(i):
            try:
                download_pic(j, name=str(m) + ".jpg")
            except Exception as e:
                print(e)
            m += 1
        print("Album finished")
        os.chdir(cwd)
    speed = progress / total_length * 100
    print(speed, "% complete")

The scraping results:

To make it easier to use, the script can be turned into a command-line tool. First, I point the working directory at the place where I keep pictures on my computer:
/Users/mr.cheng/Pictures/图片/BT之家/
Then I add this line at the very top of the .py file:
#!/Users/mr.cheng/anaconda3/bin/python

In other words, the file is the same script as above, except for the shebang at the top and the changed download directory:

#!/Users/mr.cheng/anaconda3/bin/python

# ... the rest of the file is identical to the script above, except that the
# working directory for the crawl now points at the pictures folder:
os.chdir("/Users/mr.cheng/Pictures/图片/BT之家/")

Finally, remove the file extension and put the file into this folder:
/Users/mr.cheng/anaconda3/bin/

Anything in this folder can be invoked globally, since it is on the shell's PATH.
I named the file bthome. However, simply removing the extension may not be enough:

You need some way to turn the file into a so-called Unix executable. My usual trick is to take some other Unix executable and make a copy of it (you may not be able to create the copy inside that folder; copying it to the desktop works), replace its contents, and then rename it to whatever you want.
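
Alternatively, the execute bit can be set directly instead of copying another binary. Here is a minimal sketch using os.chmod; the path below is just an assumption about where the renamed script ends up.

import os
import stat

# Assumed final location of the renamed script.
script_path = "/Users/mr.cheng/anaconda3/bin/bthome"

# Add the execute bits (user, group, others) on top of the current permissions.
mode = os.stat(script_path).st_mode
os.chmod(script_path, mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)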

Once that is done, all I have to do later is open a terminal and type bthome to refresh the images.

