21. 网络爬虫阶段案例实战

优质

小牛编辑

150浏览

2023-12-01

任务：Ajax爬取今日头条的街拍美图
爬取url地址：https://www.toutiao.com/search_content/

分析：

分析url地址：https://www.toutiao.com/search_content/? （每页20条数据，通过Ajax加载）
需要提交参数：

    params = {
        'offset': offset, #数据偏移量（每页20条，依次为0、20、40…）
        'format': 'json',
        'keyword': '街拍', #搜索关键字
        'autoload': 'true',
        'count': '20',
        'cur_tab': '3',
        'from': 'gallery',
    }

具体实现代码如下：

import os,time
import requests
from urllib.parse import urlencode
from urllib.request import urlretrieve

def getPage(offset):
    """Fetch one batch of Toutiao search results as parsed JSON.

    Args:
        offset: Value sent as the ``offset`` query parameter. The request
            asks for 20 results (``count``), so callers step it by 20.

    Returns:
        The decoded JSON payload, or ``None`` when the request fails, the
        server answers with a non-200 status, or the body is not valid JSON.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'cur_tab': '3',
        'from': 'gallery',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        # Covers connection errors as well as timeouts and other transport
        # failures; the caller treats None as "no data for this page".
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:
        # The body was not JSON (e.g. an HTML error or anti-bot page).
        return None

def getImages(json):
    """Yield one record per image found in a search-result payload.

    Args:
        json: Decoded response mapping, or ``None`` if the fetch failed.
            Entries are read from its ``'data'`` list.

    Yields:
        Dicts of the form ``{'image': <url>, 'title': <title>}``, one for
        every element of each entry's ``'image_list'``.
    """
    # A failed fetch hands us None; treat it as an empty result set.
    if not json:
        return
    for item in json.get('data') or []:
        title = item.get('title')
        # Some entries carry no 'image_list' at all; skip them instead of
        # failing on iteration over None.
        for image in item.get('image_list') or []:
            yield {
                'image': image.get('url'),
                'title': title,
            }

def saveImage(item):
    """Download one image into a folder named after its title.

    Args:
        item: Mapping with an ``'image'`` URL and a ``'title'``, in the
            shape yielded by ``getImages``.

    Items without an image URL are skipped. An item without a title is
    stored directly under ``./mypic/``.
    """
    local_image_url = item.get('image')
    if not local_image_url:
        # Nothing to download for this record.
        return

    # One directory per title; makedirs also creates the "./mypic" parent
    # on the first run and tolerates the directory already existing.
    path = os.path.join("./mypic/", item.get('title') or '')
    os.makedirs(path, exist_ok=True)

    # Build the download URL and local file name. Every 'list' in the URL is
    # swapped for 'large' -- presumably thumbnail -> full-size; confirm
    # against the live API. "http:" is prepended, which assumes the API
    # returns scheme-less URLs.
    image_url = "http:" + local_image_url.replace('list', 'large')
    save_pic = path + "/" + local_image_url.split("/")[-1] + ".jpg"

    # urlretrieve streams the remote file straight to disk.
    urlretrieve(image_url, save_pic)

def main(offset):
    """Crawl one batch of search results and save every image in it.

    Args:
        offset: Result offset forwarded to ``getPage``.
    """
    page = getPage(offset)
    if page is None:
        # The fetch failed or returned a non-200 status; nothing to save.
        return
    for item in getImages(page):
        print(item)
        saveImage(item)

# Run the crawl only when executed as a script: request offsets
# 0, 20, 40, 60 and 80, pausing one second after each batch.
if __name__ == '__main__':
    for offset in range(0, 100, 20):
        main(offset=offset)
        time.sleep(1)