当前位置: 首页 > 工具软件 > 51妹子图 > 使用案例 >

爬虫实战22:多线程爬取豆瓣妹子图

曹和正
2023-12-01
from threading import Thread
import urllib.request
import bs4
import urllib.error
from queue import Queue
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"}


class productor(Thread):
    def __init__(self, pageQueue, imgQueue):
        super(productor, self).__init__()
        self.pageQueue = pageQueue
        self.imgQueue = imgQueue

    def run(self):
        while True:
            if self.pageQueue.empty():
                break
            url = self.pageQueue.get()
            print(url)
            request = urllib.request.Request(url, headers=headers)
            response = urllib.request.urlopen(request)
            html = response.read().decode("utf-8")
            soup = bs4.BeautifulSoup(html, "lxml")
            pic_lists = soup.find('ul', attrs={"class": "thumbnails"})
            pic_list = pic_lists.find_all('li')
            for pics in pic_list:
                pic_url = pics.find('img')['src']
                title = pics.find('img')['title']
                title = title.replace('.', '').replace('?', '').replace('*', '').replace('/', '').replace('\\', '').replace('。', '').replace(',', '')
                imgQueue.put((pic_url, title))


class consumor(Thread):
    def __init__(self, pageQueue, imgQueue):
        Thread.__init__(self)
        self.pageQueue = pageQueue
        self.imgQueue = imgQueue

    def run(self):
        while True:
            if self.pageQueue.empty() and self.imgQueue.empty():
                break
            pic_url, title = self.imgQueue.get()
            urllib.request.urlretrieve(pic_url, "D:\工作\python\spider\images\\" + title + ".jpg")
            print('下载成功!')


if __name__ == '__main__':
    pageQueue = Queue(50)
    for i in range(1, 51):
        url = "https://www.dbmeinv.com/?pager_offset=" + str(i)
        pageQueue.put(url)
    imgQueue = Queue(1000)
    for j in range(5):
        thread = productor(pageQueue, imgQueue)
        thread.start()
        thread.join()
    for j in range(5):
        thread = consumor(pageQueue, imgQueue)
        thread.start()
        thread.join()


 类似资料: