当前位置: 首页 > 工具软件 > 51妹子图 > 使用案例 >

爬取妹子图片

杨宏儒
2023-12-01

爬取图片有一个要注意的点:要对 jpg 那条链接再解析(请求)一次,然后用 .write 写入文件。

import requests
import re

'''Scrape gallery images from 51xw.net: walk the listing pages, enter each
album, page through it (pages 1-6), and download every image found.
(You have to dig into the page source to find the hooks~)'''

import os

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}

path = 'D:/mmphoto/'
os.makedirs(path, exist_ok=True)  # original crashed with FileNotFoundError if the folder was missing
urls = ['https://www.51xw.net/meizi/hot/page/{}'.format(i) for i in range(1, 3)]

# Global counter: in the original, k reset to 1 for every album page, so
# files from different albums could silently overwrite each other.
k = 1
for url in urls:
    listing = requests.get(url, headers=headers)
    album_links = re.findall('<span><a href="(.*?)" target="_blank">', listing.text)
    for album in album_links:
        for page in range(1, 7):
            link = album + '/' + str(page)
            photo_html = requests.get(link, headers=headers)
            photo_urls = re.findall('<img src="(.*?)" alt', photo_html.text)
            for photo_url in photo_urls:
                data = requests.get(photo_url, headers=headers)
                # NOTE: the sliced URL tail used as a file name must not
                # contain '/' — presumably true for this site; verify.
                with open(path + str(k) + photo_url[-10:], 'wb') as fp:
                    fp.write(data.content)
                k += 1


爬取过程很慢。。大概一二十分钟。
。。。结果就不展示了吧,,羞耻。。。
用了四重循环,第一个大循环是遍历所有大页面的url,第二重循环是在大页面的网页进入具体的主题。。羞耻。。第三个循环是在具体的主题里切换页面1,2,3,4。第四个循环是把图片写入文件。。话说网站为啥要把图片藏得那么深~滑稽

2月15日更新,上新代码

import os
import requests
import re
from lxml import etree
from random import randint

# Shared request headers. The hard-coded referer presumably satisfies the
# site's hotlink protection — TODO confirm; the User-Agent mimics a
# desktop Chrome browser.
headers = {
    'referer': 'https://www.mzitu.com/168151',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
}


def get_html(url):
    """Fetch *url* and return the response body as text, or None on failure.

    Decodes the raw bytes directly (``content.decode()``) instead of using
    ``.text`` because requests' charset guess can mis-handle GBK pages.

    Returns None for non-200 responses, network errors, and decode errors.
    """
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.content.decode()
        return None
    # Narrowed from a bare ``except:`` which also swallowed
    # KeyboardInterrupt/SystemExit; decode errors stay handled as before.
    except (requests.RequestException, UnicodeDecodeError):
        return None


def get_entrance_url(html):
    """Extract the album entry links from a listing page's HTML."""
    tree = etree.HTML(html)
    return tree.xpath('//ul[@id="pins"]/li/a/@href')


def get_next_page_first(html):
    """Return the list of 'next page' hrefs found in *html* (may be empty)."""
    pattern = re.compile('<a class="next page-numbers" href="(.*?)">下一页', re.S)
    return pattern.findall(html)


def get_info(html):
    """Download the main image of an album page, then keep following the
    'next page' link until pagination ends.

    Each image is saved to ``D:/shepi/`` under a name built from the first
    10 characters of the page title plus the last 7 characters of the image
    URL.  The original recursed once per page, which risks hitting the
    recursion limit on long albums; this version iterates instead.  The
    original also mixed tabs and spaces in the recursive tail (a TabError
    in Python 3) and carried a block of dead commented-out code — both
    removed.
    """
    path = 'D:/shepi/'
    os.makedirs(path, exist_ok=True)  # original open() failed if the folder was missing
    while html:
        selector = etree.HTML(html)
        photo_link = selector.xpath('//div[@class="main-image"]/p/a/img/@src')[0]
        file_name = selector.xpath('//title/text()')[0][0:10]
        response = requests.get(photo_link, headers=headers)
        # NOTE: assumes the last 7 URL characters contain no '/' — verify.
        name = photo_link[-7:]
        with open(path + file_name + name, 'wb') as fp:
            fp.write(response.content)
        # a[6] is the 'next page' anchor in this site's pager; the xpath
        # yields an empty list on the last page, which ends the loop.
        next_page = selector.xpath("//div[@class='pagenavi']/a[6]/@href")
        print(next_page)
        if not next_page:
            break
        html = get_html(next_page[0])


if __name__ == '__main__':
    # Fetch the front page, collect every album entry link, and scrape
    # each album in turn.
    start_url = 'https://www.mzitu.com'
    front_page = get_html(start_url)
    for entry in get_entrance_url(front_page):
        get_info(get_html(entry))
        
 类似资料: