# -*- coding: utf-8 -*-
'''
程 序 名:墨迹天气爬虫
编 写 人:bxgj
运行环境:win7x64 Python3.6.4
修改日志:2018.09.09 新建
2018.09.10 完成爬取主要天气信息功能
2018.09.12 完成爬取风力等信息的功能
2018.09.14 完成爬取空气质量信息的功能
2018.09.15 重构代码
2018.09.16 完成数据保存功能
2018.09.21 功能基本完成,修复部分细节问题
版 本:V1.2
备 注:由于最近没有极端天气,所以爬取高温预警、雷电预警之类的功能暂未完成
空气质量评定标准,AQI分级
优 0-50
良 51-100
轻度污染 101-150
中度污染 151-200
重度污染 201-300
严重污染 301-500
500以上爆表了
'''
# ---------------- 导入模块 ----------------
from bs4 import BeautifulSoup
import requests
import time
import datetime
import os
import re
import csv
# ---------------- Global variables, initialisation ----------------
# Pages to scrape (Yanta District, Xi'an, Shaanxi)
wea_url = "https://tianqi.moji.com/weather/china/shaanxi/yanta-district"
aqi_url = "https://tianqi.moji.com/aqi/china/shaanxi/yanta-district"
'''
墨迹天气官方网站
https://tianqi.moji.com/
链接格式
天气预报 https://tianqi.moji.com/weather/china/省会/市、区、县等
空气质量 https://tianqi.moji.com/aqi/china/省会/市、区、县等
地级市,直接用拼音,如
https://tianqi.moji.com/weather/china/shaanxi/xian
市区,区名的拼音加-district,如
https://tianqi.moji.com/weather/china/shaanxi/yanta-district
县,县名的拼音加-county
镇,镇名的拼音加-town
其他的地名,如XX山、XX风景区、XX湖基本都是拼音加英文
特殊地名特殊处理,如
秦始皇陵
mausoleum-of-the-first-qin-emperor
陕西历史博物馆
shanxi-history-museum
'''
# Debug-mode switch: when True the spiders parse saved local HTML files
# instead of fetching the live site.
# DEBUG_MODE = True
DEBUG_MODE = False
# ---------------- 类定义 ----------------
# 实时天气信息及预报天气信息
# Container for one snapshot of current weather plus air-quality readings,
# as scraped from the Moji weather pages.
class weather():
    def __init__(self, uptime=None, city="", weather="", temp=None,
                 humi=None, wind_dir="", wind_min=None, wind_max=None, tips="", description="",
                 aqi=None, aqi_level="", aqi_PM10=None, aqi_PM2P5=None, aqi_NO2=None, aqi_SO2=None, aqi_O3=None, aqi_CO=None,
                 other_info=""):
        # Moment this object was created (i.e. when the crawl ran),
        # truncated to whole seconds.
        self.crawltime = datetime.datetime.now().replace(microsecond=0)
        # Basic readings; `uptime` is the publish time shown on the site,
        # NOT the crawl time.
        self.uptime = uptime
        self.city = city
        self.weather = weather
        self.temp = temp
        # Detailed readings
        self.humi = humi
        self.wind_dir = wind_dir
        self.wind_min = wind_min
        self.wind_max = wind_max
        self.tips = tips
        self.description = description  # one-line overall weather summary
        # Air-quality readings
        self.aqi = aqi
        self.aqi_level = aqi_level
        self.aqi_PM10 = aqi_PM10
        self.aqi_PM2P5 = aqi_PM2P5
        self.aqi_NO2 = aqi_NO2
        self.aqi_SO2 = aqi_SO2
        self.aqi_O3 = aqi_O3
        self.aqi_CO = aqi_CO
        # Free-form extras (e.g. heat / lightning alerts)
        self.other_info = other_info

    def get_weather_info_list(self):
        """Return every field as one flat list (timestamps stringified,
        everything else kept as stored) -- convenient for CSV rows."""
        stamps = [str(self.crawltime), str(self.uptime)]
        basics = [self.city, self.weather, self.temp]
        details = [self.humi, self.wind_dir, self.wind_min, self.wind_max,
                   self.tips, self.description]
        air = [self.aqi, self.aqi_level, self.aqi_PM10, self.aqi_PM2P5,
               self.aqi_NO2, self.aqi_SO2, self.aqi_O3, self.aqi_CO]
        return stamps + basics + details + air + [self.other_info]

    def get_weather_info_str(self):
        """Same layout as get_weather_info_list, but numeric/None fields are
        stringified; wind_dir and aqi_level pass through unchanged."""
        fields = self.get_weather_info_list()
        passthrough = {6, 12}  # positions of wind_dir and aqi_level
        return [item if pos in passthrough else str(item)
                for pos, item in enumerate(fields)]

    def __str__(self):
        # Single pipe-joined line; "None" placeholders become empty fields.
        joined = "|".join(self.get_weather_info_str())
        return joined.replace("None", "")
# 天气预报信息,比较简略
# Container for one day of the 3-day forecast (coarser than `weather`).
class weather_forecast():
    def __init__(self, uptime=None, city="", weather="", temp_min=None, temp_max=None,
                 wind_dir="", wind_min=None, wind_max=None, aqi=None, aqi_level=None, other_info=""):
        # Moment this object was created, truncated to whole seconds.
        self.crawltime = datetime.datetime.now().replace(microsecond=0)
        # `uptime` is the publish time shown on the site, NOT the crawl time.
        self.uptime = uptime
        self.city = city
        self.weather = weather
        self.temp_min = temp_min
        self.temp_max = temp_max
        # Wind details
        self.wind_dir = wind_dir
        self.wind_min = wind_min
        self.wind_max = wind_max
        # Air quality
        self.aqi = aqi
        self.aqi_level = aqi_level
        # Free-form extras (e.g. weather alerts)
        self.other_info = other_info

    def get_weather_info_list(self):
        """Return every field as one flat list (timestamps stringified)."""
        return [str(self.crawltime), str(self.uptime), self.city, self.weather,
                self.temp_min, self.temp_max, self.wind_dir, self.wind_min,
                self.wind_max, self.aqi, self.aqi_level, self.other_info]

    def get_weather_info_str(self):
        """Like get_weather_info_list, but with every field stringified."""
        return [str(field) for field in self.get_weather_info_list()]

    def __str__(self):
        # Single pipe-joined line; "None" placeholders become empty fields.
        joined = "|".join(self.get_weather_info_str())
        return joined.replace("None", "")
# ---------------- 函数定义 ----------------
# 天气爬虫
# Weather crawler: scrapes one Moji live-weather page and fills in
# `weather_info` (current conditions) and `weather_forecast_info`
# (3-day forecast) in place.
#   weather_info          -- a `weather` instance, mutated in place
#   weather_forecast_info -- list of three `weather_forecast` instances, mutated in place
#   url                   -- Moji weather page URL for one location
# Returns None; on network/HTTP failure it prints a message and leaves the
# objects untouched.
def weather_spider(weather_info, weather_forecast_info, url):
    if (DEBUG_MODE):
        # Parse a saved local copy of the page instead of hitting the network
        with open("yanta.html", 'r', encoding='utf-8') as html_file:
            html_text = html_file.read()
        soup = BeautifulSoup(html_text, "lxml")
    else:
        try:
            # FIX: timeout added so a stalled connection cannot hang the
            # crawl loop forever.
            web_data = requests.get(url, timeout=30)
        except requests.exceptions.ConnectionError:
            print("网络连接异常")
            return
        except Exception:
            print("其他异常")
            return
        if (web_data.status_code != requests.codes.ok):
            print("服务器响应异常", web_data.status_code)
            return
        soup = BeautifulSoup(web_data.text, "lxml")
    # City name
    city_tag = soup.select(".search_default > em")
    city_name = city_tag[0].get_text().replace(' ', '')
    weather_info.city = city_name
    # Publish time shown on the page (hour:minute only; the date is taken
    # to be today)
    uptime_tag = soup.find("strong", class_="info_uptime")
    uptime_str = uptime_tag.get_text()
    uptime_str = re.search(r"(\d+):(\d+)", uptime_str).groups()
    tmp_datetime = datetime.datetime.now()
    uptime = tmp_datetime.replace(hour=int(uptime_str[0]), minute=int(uptime_str[1]),
                                  second=0, microsecond=0)
    weather_info.uptime = uptime
    # Overall description from the <meta name="description"> tag
    description_tag = soup.select('meta[name="description"]')
    weather_description = description_tag[0].get("content")
    # BUGFIX: str.replace returns a new string; the original discarded the
    # result, so spaces were never removed and ASCII commas never replaced.
    weather_description = weather_description.replace(" ", "").replace(",", ",")
    weather_info.description = weather_description
    # Current conditions: <em> holds the temperature, <b> the weather text
    tmp_tag = soup.find("div", class_="wea_weather clearfix")
    weather_info.temp = float(tmp_tag.em.string)
    weather_info.weather = tmp_tag.b.string
    # Humidity (<span>) and wind (<em>) live under this div
    tmp_tag = soup.find("div", class_="wea_about clearfix")
    humi_str = tmp_tag.span.string
    humi_value = re.search(r"(\d+)", humi_str).group()
    weather_info.humi = int(humi_value)
    wind_str = tmp_tag.em.string  # the <em> tag holds the wind text
    try:
        wind_dir = re.search(r"([东西南北微无台]+风)", wind_str).group()
    except AttributeError:
        print("可能是风力等级有新的汉字,原始数据:", wind_str)
    else:
        weather_info.wind_dir = wind_dir
    # BUGFIX: guard the digit search -- wind strings with no number
    # (e.g. "微风") used to raise AttributeError and abort the whole crawl.
    wind_match = re.search(r"(\d+)", wind_str)
    if wind_match:
        weather_info.wind_min = wind_match.group()
    tmp_tag = soup.find("div", class_="wea_tips clearfix")
    weather_info.tips = tmp_tag.em.string
    # 3-day forecast: one <ul class="days clearfix"> per day
    tmp_tag = soup.find_all("ul", class_ = "days clearfix")
    day_num = 0
    for oneday in tmp_tag:
        items_tag = oneday.find_all("li")
        # City and publish time were already scraped above -- reuse them
        weather_forecast_info[day_num].uptime = weather_info.uptime
        weather_forecast_info[day_num].city = weather_info.city
        # item 0: "today/tomorrow/day after" label -- intentionally skipped
        # item 1: weather text (whitespace stripped)
        weather_forecast_info[day_num].weather = re.sub(r"\s", "", items_tag[1].get_text())
        # item 2: min / max temperature
        temps = re.findall(r"(\d+)", items_tag[2].get_text())
        weather_forecast_info[day_num].temp_min = int(temps[0])
        weather_forecast_info[day_num].temp_max = int(temps[1])
        # item 3: wind direction (<em>) and force level(s) (<b>)
        weather_forecast_info[day_num].wind_dir = items_tag[3].em.get_text()
        wind_values = re.findall(r"(\d+)", items_tag[3].b.get_text())
        if (len(wind_values) == 1):
            weather_forecast_info[day_num].wind_min = int(wind_values[0])
        elif (len(wind_values) == 2):
            weather_forecast_info[day_num].wind_min = int(wind_values[0])
            weather_forecast_info[day_num].wind_max = int(wind_values[1])
        else:
            print("解析风力级数出错")
        # item 4: AQI value followed by its level text, e.g. "85良"
        aqi_str = re.sub(r"\s", "", items_tag[4].get_text())
        weather_forecast_info[day_num].aqi = int(re.search(r"(\d+)", aqi_str).group())
        weather_forecast_info[day_num].aqi_level = re.search(r"(\D+)", aqi_str).group()
        day_num += 1
# 空气污染爬虫
# Air-quality crawler: scrapes one Moji AQI page and fills in the aqi_*
# fields of `weather_info` in place.
#   weather_info -- a `weather` instance, mutated in place
#   url          -- Moji AQI page URL for one location
# Returns None; on network/HTTP failure it prints a message and leaves the
# object untouched.  The page's own publish time is deliberately not used
# (the original author found it to lag badly behind reality).
def aqi_spider(weather_info, url):
    if (DEBUG_MODE):
        # Parse a saved local copy of the page instead of hitting the network
        with open("air.html", 'r', encoding='utf-8') as html_file:
            html_text = html_file.read()
        soup = BeautifulSoup(html_text, "lxml")
    else:
        try:
            # FIX: timeout added so a stalled connection cannot hang the
            # crawl loop forever.
            web_data = requests.get(url, timeout=30)
        except requests.exceptions.ConnectionError:
            print("网络连接异常")
            return
        except Exception:
            print("其他异常")
            return
        if (web_data.status_code != requests.codes.ok):
            print("服务器响应异常", web_data.status_code)
            return
        soup = BeautifulSoup(web_data.text, "lxml")
    # Overall AQI value and level are exposed through stable element ids
    aqi_value_tag = soup.select("#aqi_value")
    weather_info.aqi = int(aqi_value_tag[0].get_text())
    aqi_desc_tag = soup.select("#aqi_desc")
    weather_info.aqi_level = aqi_desc_tag[0].get_text()
    # Detail list: pollutant names in <em> tags, readings in <span> tags,
    # in matching order
    aqi_info_item = soup.find("ul", class_="clearfix")
    aqi_info_name_list = ["".join(name.strings)
                          for name in aqi_info_item.find_all("em")]
    # NOTE(review): int() assumes every reading is a whole number; if the
    # site ever shows a decimal (CO is often reported in mg/m3) this raises
    # ValueError -- confirm against the live page.
    aqi_info_value_list = [int(value.string)
                           for value in aqi_info_item.find_all("span")]
    # Pair names with readings so each pollutant can be looked up by name
    aqi_info_list = dict(zip(aqi_info_name_list, aqi_info_value_list))
    weather_info.aqi_PM10 = aqi_info_list["PM10"]
    weather_info.aqi_PM2P5 = aqi_info_list["PM2.5"]
    weather_info.aqi_NO2 = aqi_info_list["NO2"]
    weather_info.aqi_SO2 = aqi_info_list["SO2"]
    weather_info.aqi_O3 = aqi_info_list["O3"]
    weather_info.aqi_CO = aqi_info_list["CO"]
# 主函数
# Main entry point: prepares the output directory and files, then loops
# forever scraping weather + AQI every 45 minutes, appending one history row
# to a CSV and overwriting a "latest snapshot" text file that other programs
# can read.
def main():
    this_path = os.path.realpath(__file__)
    dir_path = os.path.dirname(this_path)
    save_path = os.path.join(dir_path, "weather_data")
    # Debug runs write to separate files so real history is not polluted
    if (DEBUG_MODE):
        save_csv_filename = os.path.join(save_path, "weather_debug.csv")
        save_txt_filename = os.path.join(save_path, "weather_debug.txt")
    else:
        save_csv_filename = os.path.join(save_path, "weather.csv")
        save_txt_filename = os.path.join(save_path, "weather.txt")
    # FIX: makedirs(exist_ok=True) replaces the original
    # exists()-then-mkdir pair, which was race-prone.
    os.makedirs(save_path, exist_ok=True)
    if not (os.path.exists(save_csv_filename)):
        # First run: write the CSV header row.  GBK encoding is kept from the
        # original (presumably so Excel on Chinese Windows opens it directly).
        with open(save_csv_filename, "w", newline = "", encoding="GBK") as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(["爬取时间", "更新时间", "地区", "天气", "温度",
                    "湿度", "风向", "最小风力", "最大风力", "小贴士", "描述",
                    "空气质量指数", "空气质量等级", "PM10", "PM2.5", "NO2", "SO2", "O3", "CO", "其他信息"])
    count = 0
    while (True):
        # Scrape current conditions, 3-day forecast and air quality
        all_weather_info = weather()
        all_weather_forecast_info = [weather_forecast(), weather_forecast(), weather_forecast()]
        weather_spider(all_weather_info, all_weather_forecast_info, wea_url)
        aqi_spider(all_weather_info, aqi_url)
        print(all_weather_info.get_weather_info_list())
        for info in all_weather_forecast_info:
            print(info.get_weather_info_list())
        # Append one row of history to the CSV
        with open(save_csv_filename, "a", newline = "", encoding="GBK") as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(all_weather_info.get_weather_info_list())
        # Overwrite the latest-snapshot text file for other programs to read
        with open(save_txt_filename, "w", encoding="GBK") as txt_file:
            txt_file.write(str(all_weather_info) + "\n")
            for item in all_weather_forecast_info:
                txt_file.write(str(item) + "\n")
        count += 1
        print("第%d次爬取完成,待机中……\n" %(count))
        time.sleep(2700)  # 45 minutes between crawls
    # (the original had an unreachable `return` after the infinite loop)
# ---------------- 程序入口 ----------------
if __name__ == "__main__":
    main()