One thing to note when scraping images: matching the jpg link out of the page's HTML is not enough. That link itself has to be requested a second time, and the response body written to a file with .write.
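A minimal sketch of that two-step pattern (the URL and filename here are placeholders); the full script below does exactly this inside its innermost loop:

import requests

img_url = 'https://example.com/photo.jpg'  # placeholder: the jpg link scraped from the page
resp = requests.get(img_url)               # second request: this one fetches the image bytes
with open('photo.jpg', 'wb') as fp:        # 'wb' because the payload is binary, not text
    fp.write(resp.content)                 # .content (bytes), not .text (str)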
import requests
import re

'''You have to dig into the page source to find the way in~'''
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
path = 'D:/mmphoto/'
urls = ['https://www.51xw.net/meizi/hot/page/{}'.format(i) for i in range(1, 3)]
for url in urls:
    r = requests.get(url, headers=headers)
    # gallery entry links on the list page
    Big = re.findall('<span><a href="(.*?)" target="_blank">', r.text)
    for small in Big:
        for j in range(1, 7):  # each gallery spans several numbered pages
            link = small + '/' + str(j)
            photo_html = requests.get(link, headers=headers)
            specific_photo = re.findall('<img src="(.*?)" alt', photo_html.text)
            k = 1
            for x in specific_photo:
                data = requests.get(x, headers=headers)
                # note: the slice used in the filename must not contain a '/'
                fp = open(path + str(k) + x[-10:], 'wb')
                fp.write(data.content)
                fp.close()
                k += 1
The crawl is slow... takes maybe ten or twenty minutes.
...I won't show the results here, too embarrassing...
Four nested loops in total: the outermost walks the list-page URLs; the second goes from a list page into each individual gallery (embarrassing...); the third flips through pages 1, 2, 3, 4 inside a gallery; the fourth writes the images to files. Why does the site bury its images so deep~ (grin)
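Most of those ten or twenty minutes are serial network round-trips, not disk writes. One common way to cut the wall time, shown here only as a sketch (the download helper and worker count are my choices, not part of the original script), is to hand the innermost loop to a thread pool:

from concurrent.futures import ThreadPoolExecutor

def download(img_url, filename):
    # one image per task; the threads overlap the network waits
    data = requests.get(img_url, headers=headers)
    with open(filename, 'wb') as fp:
        fp.write(data.content)

# would replace the innermost 'for x in specific_photo' loop above
with ThreadPoolExecutor(max_workers=8) as pool:
    for k, x in enumerate(specific_photo, start=1):
        pool.submit(download, x, path + str(k) + x[-10:])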
Update (February 15): new code below.
import os
import requests
import re
from lxml import etree

headers = {
    # mzitu checks the Referer header on image requests; without it they get blocked
    'referer': 'https://www.mzitu.com/168151',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
}

def get_html(url):
    try:
        content = requests.get(url, headers=headers)
        if content.status_code == 200:
            # .text is unreliable here: a wrong encoding guess garbles the gbk/Chinese text
            return content.content.decode()
        else:
            return None
    except Exception:
        return None
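# Why content.decode() instead of .text: requests guesses the charset from the HTTP
# headers, and a wrong guess mangles Chinese pages. An alternative sketch that keeps
# .text but corrects the guess from the page body first:
#
#     r = requests.get(url, headers=headers)
#     r.encoding = r.apparent_encoding
#     html = r.text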
def get_entrance_url(html):
    selector = etree.HTML(html)
    url_list = selector.xpath('//ul[@id="pins"]/li/a/@href')
    return url_list

def get_next_page_first(html):
    next_page_url = re.findall(
        '<a class="next page-numbers" href="(.*?)">下一页', html, re.S)
    return next_page_url
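# get_next_page_first extracts the front page's own "next page" link, but __main__
# below only crawls page 1. A sketch of how it could be wired in (my wiring, not
# part of the original):
#
#     while html:
#         for link in get_entrance_url(html):
#             get_info(get_html(link))
#         nxt = get_next_page_first(html)
#         html = get_html(nxt[0]) if nxt else None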
def get_info(html):
    selector = etree.HTML(html)
    photo_link = selector.xpath('//div[@class="main-image"]/p/a/img/@src')[0]
    file_name = selector.xpath('//title/text()')[0]
    # print(file_name)
    # print(photo_link)
    path = 'D:/shepi/'
    file_name = file_name[0:10]  # keep only the first 10 chars of the title
    real_path = path + file_name
    response = requests.get(photo_link, headers=headers)
    name = photo_link[-7:]
    with open(real_path + name, 'wb') as fp:
        fp.write(response.content)
    # TODO: give each gallery its own folder instead of a filename prefix
    # (see the sketch after this script)
    next_page = selector.xpath("//div[@class='pagenavi']/a[6]/@href")  # 6th pager link = next page
    print(next_page)
    if next_page:
        data = get_html(next_page[0])
        if data:  # get_html returns None on failure
            get_info(data)
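# get_info recurses once per gallery page, so a very long gallery could bump into
# Python's default recursion limit (about 1000 frames). The same pagination loop
# written iteratively, as a sketch (get_info_iter is my name, not the original's):
def get_info_iter(html):
    while html:
        selector = etree.HTML(html)
        photo_link = selector.xpath('//div[@class="main-image"]/p/a/img/@src')[0]
        file_name = selector.xpath('//title/text()')[0][0:10]
        response = requests.get(photo_link, headers=headers)
        with open('D:/shepi/' + file_name + photo_link[-7:], 'wb') as fp:
            fp.write(response.content)
        next_page = selector.xpath("//div[@class='pagenavi']/a[6]/@href")
        html = get_html(next_page[0]) if next_page else None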
if __name__ == '__main__':
    url = 'https://www.mzitu.com'
    html = get_html(url)
    url_list = get_entrance_url(html)
    # get_info(html)
    for link in url_list:
        # print(link)  # tested OK
        html = get_html(link)
        get_info(html)
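The commented-out block inside get_info was reaching for one folder per gallery rather than a shared filename prefix. A minimal version of that idea, assuming the same path layout (save_photo is a hypothetical helper; os.makedirs with exist_ok=True makes the check-then-create safe to repeat):

import os
import requests

def save_photo(gallery_dir, name, photo_link, headers):
    # hypothetical helper: one directory per gallery, created on first use
    os.makedirs(gallery_dir, exist_ok=True)
    response = requests.get(photo_link, headers=headers)
    with open(os.path.join(gallery_dir, name), 'wb') as fp:
        fp.write(response.content)

# inside get_info, this would replace the open(real_path + name, ...) lines:
#     save_photo(real_path, name, photo_link, headers)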