多线程爬取某个网页的所有图片

本文最后更新于 2024年10月2日 上午

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#coding=utf-8
import importlib
import sys
import urllib.request

from bs4 import BeautifulSoup, UnicodeDammit


def imageSpider(start_url):
    """Download every distinct image referenced by ``<img>`` tags on a page.

    Fetches ``start_url`` with the module-level ``headers``, decodes the body
    with ``UnicodeDammit`` (trying utf-8, then gbk), parses it with
    BeautifulSoup's lxml parser, resolves each ``src`` attribute against
    ``start_url`` and hands every URL not seen before to ``download``.

    Errors are printed rather than raised: a failure on one image does not
    stop the remaining images, while a failure fetching or parsing the page
    itself ends the crawl.

    Args:
        start_url: URL of the page to scan; also the base for relative
            image paths.
    """
    # Imported from its documented home instead of relying on
    # urllib.request re-exporting the name.
    from urllib.parse import urljoin

    try:
        request = urllib.request.Request(start_url, headers=headers)
        # The context manager closes the response even if read() raises;
        # the timeout matches the one download() uses.
        with urllib.request.urlopen(request, timeout=100) as response:
            raw = response.read()
        markup = UnicodeDammit(raw, ["utf-8", "gbk"]).unicode_markup
        soup = BeautifulSoup(markup, "lxml")
        seen = set()  # constant-time duplicate check
        for image in soup.select("img"):
            try:
                # A tag without "src" raises KeyError, reported below.
                url = urljoin(start_url, image["src"])
                if url in seen:
                    continue
                seen.add(url)
                print(url)
                download(url)
            except Exception as err:
                print(err)
    except Exception as err:
        print(err)


def download(url):
    """Fetch ``url`` and save it as ``images/<count><ext>``.

    Increments the module-level ``count`` on every call (including calls
    that later fail) and uses the new value as the file name. The extension
    is taken from the path component of the URL, so query strings are
    ignored and extensions of any length (``.jpeg``, ``.webp``) are kept.
    The ``images`` directory is created if it does not exist.

    Errors are printed rather than raised.

    Args:
        url: Absolute URL of the image to fetch.
    """
    global count
    import os
    from urllib.parse import urlparse

    try:
        count = count + 1
        ext = os.path.splitext(urlparse(url).path)[1]
        # Drop suffixes containing characters other than letters/digits
        # (e.g. ".jpg:large") so the result is always a usable file name.
        if not ext[1:].isalnum():
            ext = ""
        filename = str(count) + ext
        os.makedirs("images", exist_ok=True)
        request = urllib.request.Request(url, headers=headers)
        # Both context managers release their resource even on failure.
        with urllib.request.urlopen(request, timeout=100) as response:
            payload = response.read()
        with open(os.path.join("images", filename), "wb") as fobj:
            fobj.write(payload)
        print("downloaded " + filename)
    except Exception as err:
        print(err)

# Page whose <img> tags are crawled when this file is run as a script.
start_url = "https://blog.xinhaojin.top"
# Request headers passed to every urllib request made by imageSpider/download.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"}
# Running number of download attempts; download() uses it to name files.
count = 0

# Only crawl when executed directly, so importing the module has no
# network side effects.
if __name__ == "__main__":
    imageSpider(start_url)

运行前,请在运行脚本的目录(当前工作目录)下新建 images 文件夹。


多线程爬取某个网页的所有图片
https://xinhaojin.github.io/2021/07/14/多线程爬取某个网页的所有图片/
作者
xinhaojin
发布于
2021年7月14日
更新于
2024年10月2日
许可协议