Since I'm not very familiar with asyncio yet, I feel this isn't well written — it reads like async code and sync code mixed together. I'd appreciate some pointers.
import os
import re
import asyncio

import aiofiles
import aiohttp
from lxml import etree


# Send a request and return the HTML text
async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text()


# Parse the index page and yield (detail URL, target directory) for each gallery
async def parser(html):
    tree = etree.HTML(html)
    pic_href_list = tree.xpath('//*[@class="listbox"]/a/@href')
    pic_title_list = tree.xpath('//*[@class="listbox"]/a/@title')
    for href, title in zip(pic_href_list, pic_title_list):
        path_id = re.findall(r'\d+', href)[0]  # raw string avoids an invalid-escape warning
        dir_path = os.path.join(os.getcwd(), 'zdqx', f"{path_id}_{title}")
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        yield 'http://' + href[2:], dir_path


# Collect all image URLs of one gallery
async def detail_parser(html):
    tree = etree.HTML(html)
    src_list = tree.xpath('//div[@class="img-box"]/div/a/img/@src')
    return src_list[:-1]


# Download an image and store it with the async file library aiofiles
async def content(session, url, dir_path):
    async with session.get(url) as response:
        img = await response.read()
        async with aiofiles.open(dir_path, mode='wb') as f:
            await f.write(img)  # the async with block closes the file; no explicit close() needed


async def download(url):
    async with aiohttp.ClientSession() as session:
        html_text = await fetch(session, url)
        async for detail_url, dir_path in parser(html_text):
            detail_text = await fetch(session, detail_url)
            src_list = await detail_parser(detail_text)
            for index, src in enumerate(src_list):
                file_path = os.path.join(dir_path, f"{index}.jpg")
                if not os.path.exists(file_path):
                    try:
                        await content(session, src, file_path)
                    except aiohttp.ClientError as e:  # network failures are ClientError, not AssertionError
                        print(e)
                    finally:
                        print(src)


if __name__ == '__main__':
    urls = ['http://www.zdqx.com/qingchun/index.html'] + \
           [f'http://www.zdqx.com/qingchun/index_{i}.html' for i in range(2, 41)]
    loop = asyncio.get_event_loop()
    tasks = [asyncio.ensure_future(download(url)) for url in urls]
    loop.run_until_complete(asyncio.gather(*tasks))
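A few thoughts on where the "mixed" feeling comes from, for what they're worth: parser and detail_parser never actually await anything, because lxml parsing is pure CPU work, so they can be plain def functions; only the code that touches the network or the disk needs to be async. Also, the images of one gallery are downloaded strictly one after another, although nothing forces that order. Below is a minimal sketch of both ideas, reusing the XPath from the code above; the function names, the semaphore limit of 10, and the .jpg naming are illustrative assumptions, not a tested drop-in.

import asyncio
import os

import aiofiles
from lxml import etree


def parse_image_urls(html):
    # Plain function: parsing does no I/O, so nothing here needs await.
    tree = etree.HTML(html)
    return tree.xpath('//div[@class="img-box"]/div/a/img/@src')[:-1]


async def download_gallery(session, src_list, dir_path):
    sem = asyncio.Semaphore(10)  # assumed cap on simultaneous requests

    async def fetch_image(src, file_path):
        async with sem:  # hold a slot only while the network request runs
            async with session.get(src) as response:
                data = await response.read()
        async with aiofiles.open(file_path, mode='wb') as f:
            await f.write(data)

    # One task per image: gather runs the downloads concurrently instead
    # of awaiting each one in sequence inside a for-loop.
    await asyncio.gather(*(
        fetch_image(src, os.path.join(dir_path, f"{i}.jpg"))
        for i, src in enumerate(src_list)
    ))

With the parsers synchronous, the sync/async boundary becomes obvious at a glance. On Python 3.7+ the event-loop boilerplate at the bottom can also be collapsed into a single asyncio.run(...) call, which removes the last bit of manual loop handling.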