# Image spider: fetch a web page, extract every <img> tag, and download each image.
import importlib
import sys
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup, UnicodeDammit
def imageSpider(start_url):
    """Fetch *start_url*, find every ``<img>`` tag, and download each unique image.

    Sends the request with the module-level ``headers`` and delegates saving
    each image to ``download()``.  All errors are printed rather than raised
    so a single bad tag or network hiccup does not abort the crawl.
    """
    try:
        seen = []  # image URLs already handled, in first-seen order
        req = urllib.request.Request(start_url, headers=headers)
        # Context manager closes the connection as soon as the body is read.
        with urllib.request.urlopen(req) as resp:
            raw = resp.read()
        # The page may be served as utf-8 or gbk; let UnicodeDammit decide.
        dammit = UnicodeDammit(raw, ["utf-8", "gbk"])
        soup = BeautifulSoup(dammit.unicode_markup, "lxml")
        for image in soup.select("img"):
            try:
                src = image["src"]  # raises KeyError when the tag has no src
                # urljoin is documented in urllib.parse; the original relied on
                # urllib.request.urljoin, an undocumented re-export.
                url = urllib.parse.urljoin(start_url, src)
                if url not in seen:
                    seen.append(url)
                    print(url)
                    download(url)
            except Exception as err:
                print(err)
    except Exception as err:
        print(err)
def download(url):
    """Download *url* into the CWD as ``images<count><ext>``.

    Increments the module-level ``count`` before fetching (so numbering
    matches the original behaviour even when a fetch fails).  The last four
    characters of the URL are kept as the extension only when they form one
    (e.g. ".jpg", ".png").  Errors are printed, not raised.
    """
    global count
    try:
        count = count + 1
        # Slicing avoids the original's negative-index wraparound on URLs
        # shorter than 4 characters; non-extensions get an empty suffix.
        ext = url[-4:] if url[-4:-3] == "." else ""
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=100) as resp:
            data = resp.read()
        # "with" guarantees the file handle is closed even if write() fails.
        with open("images" + str(count) + ext, "wb") as fobj:
            fobj.write(data)
        print("downloaded " + str(count) + ext)
    except Exception as err:
        print(err)
# --- script configuration --------------------------------------------------
# Page to crawl for images.
start_url = "https://blog.xinhaojin.top"
# Browser-like User-Agent so the server does not reject the scraper.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"}
# Running image counter shared with download(); numbers the output files.
count = 0

if __name__ == "__main__":
    # NOTE: the original called importlib.reload(sys) — a Python-2-era trick
    # to reset the default encoding; it is useless in Python 3 and dropped.
    imageSpider(start_url)