# -*- coding: utf-8 -*-
'''
程 序 名:墨迹天气爬虫
编 写 人:bxgj
运行环境:win7x64 Python3.6.4
修改日志:2018.09.09 新建
2018.09.10 完成爬取主要天气信息功能
2018.09.12 完成爬取风力等信息的功能
2018.09.14 完成爬取空气质量信息的功能
2018.09.15 重构代码
2018.09.16 完成数据保存功能
2018.09.21 功能基本完成,修复部分细节问题
版 本:V1.2
备 注:由于最近没有极端天气,所以爬取高温预警、雷电预警之类的功能暂未完成
空气质量评定标准,AQI分级
优 0-50
良 51-100
轻度污染 101-150
中度污染 151-200
重度污染 201-300
严重污染 301-500
500以上爆表了
'''
# ---------------- 导入模块 ----------------
from bs4 import BeautifulSoup
import requests
import time
import datetime
import os
import re
import csv
# ---------------- Global variables, initialisation ----------------
# Pages to scrape (Yanta District, Xi'an, Shaanxi)
wea_url = "https://tianqi.moji.com/weather/china/shaanxi/yanta-district"
aqi_url = "https://tianqi.moji.com/aqi/china/shaanxi/yanta-district"
'''
墨迹天气官方网站
https://tianqi.moji.com/
链接格式
天气预报 https://tianqi.moji.com/weather/china/省会/市、区、县等
空气质量 https://tianqi.moji.com/aqi/china/省会/市、区、县等
地级市,直接用拼音,如
https://tianqi.moji.com/weather/china/shaanxi/xian
市区,区名的拼音加-district,如
https://tianqi.moji.com/weather/china/shaanxi/yanta-district
县,县名的拼音加-county
镇,镇名的拼音加-town
其他的地名,如XX山、XX风景区、XX湖基本都是拼音加英文
特殊地名特殊处理,如
秦始皇陵
mausoleum-of-the-first-qin-emperor
陕西历史博物馆
shanxi-history-museum
'''
# Debug-mode switch: when True the spiders parse saved local HTML files
# instead of fetching the live site.
# DEBUG_MODE = True
DEBUG_MODE = False
# ---------------- 类定义 ----------------
# 实时天气信息及预报天气信息
# Container for one snapshot of current weather plus air-quality readings,
# as scraped from the Moji weather pages.
class weather():
    def __init__(self, uptime=None, city="", weather="", temp=None,
                 humi=None, wind_dir="", wind_min=None, wind_max=None, tips="", description="",
                 aqi=None, aqi_level="", aqi_PM10=None, aqi_PM2P5=None, aqi_NO2=None, aqi_SO2=None, aqi_O3=None, aqi_CO=None,
                 other_info=""):
        # Moment this object was created (i.e. when the crawl ran),
        # truncated to whole seconds.
        self.crawltime = datetime.datetime.now().replace(microsecond=0)
        # Basic readings; `uptime` is the publish time shown on the site,
        # NOT the crawl time.
        self.uptime = uptime
        self.city = city
        self.weather = weather
        self.temp = temp
        # Detailed readings
        self.humi = humi
        self.wind_dir = wind_dir
        self.wind_min = wind_min
        self.wind_max = wind_max
        self.tips = tips
        self.description = description  # one-line overall weather summary
        # Air-quality readings
        self.aqi = aqi
        self.aqi_level = aqi_level
        self.aqi_PM10 = aqi_PM10
        self.aqi_PM2P5 = aqi_PM2P5
        self.aqi_NO2 = aqi_NO2
        self.aqi_SO2 = aqi_SO2
        self.aqi_O3 = aqi_O3
        self.aqi_CO = aqi_CO
        # Free-form extras (e.g. heat / lightning alerts)
        self.other_info = other_info

    def get_weather_info_list(self):
        """Return every field as one flat list (timestamps stringified,
        everything else kept as stored) -- convenient for CSV rows."""
        stamps = [str(self.crawltime), str(self.uptime)]
        basics = [self.city, self.weather, self.temp]
        details = [self.humi, self.wind_dir, self.wind_min, self.wind_max,
                   self.tips, self.description]
        air = [self.aqi, self.aqi_level, self.aqi_PM10, self.aqi_PM2P5,
               self.aqi_NO2, self.aqi_SO2, self.aqi_O3, self.aqi_CO]
        return stamps + basics + details + air + [self.other_info]

    def get_weather_info_str(self):
        """Same layout as get_weather_info_list, but numeric/None fields are
        stringified; wind_dir and aqi_level pass through unchanged."""
        fields = self.get_weather_info_list()
        passthrough = {6, 12}  # positions of wind_dir and aqi_level
        return [item if pos in passthrough else str(item)
                for pos, item in enumerate(fields)]

    def __str__(self):
        # Single pipe-joined line; "None" placeholders become empty fields.
        joined = "|".join(self.get_weather_info_str())
        return joined.replace("None", "")
# 天气预报信息,比较简略
# Container for one day of the 3-day forecast (coarser than `weather`).
class weather_forecast():
    def __init__(self, uptime=None, city="", weather="", temp_min=None, temp_max=None,
                 wind_dir="", wind_min=None, wind_max=None, aqi=None, aqi_level=None, other_info=""):
        # Moment this object was created, truncated to whole seconds.
        self.crawltime = datetime.datetime.now().replace(microsecond=0)
        # `uptime` is the publish time shown on the site, NOT the crawl time.
        self.uptime = uptime
        self.city = city
        self.weather = weather
        self.temp_min = temp_min
        self.temp_max = temp_max
        # Wind details
        self.wind_dir = wind_dir
        self.wind_min = wind_min
        self.wind_max = wind_max
        # Air quality
        self.aqi = aqi
        self.aqi_level = aqi_level
        # Free-form extras (e.g. weather alerts)
        self.other_info = other_info

    def get_weather_info_list(self):
        """Return every field as one flat list (timestamps stringified)."""
        return [str(self.crawltime), str(self.uptime), self.city, self.weather,
                self.temp_min, self.temp_max, self.wind_dir, self.wind_min,
                self.wind_max, self.aqi, self.aqi_level, self.other_info]

    def get_weather_info_str(self):
        """Like get_weather_info_list, but with every field stringified."""
        return [str(field) for field in self.get_weather_info_list()]

    def __str__(self):
        # Single pipe-joined line; "None" placeholders become empty fields.
        joined = "|".join(self.get_weather_info_str())
        return joined.replace("None", "")
# ---------------- 函数定义 ----------------
# 天气爬虫
# Weather crawler: scrapes one Moji live-weather page and fills in
# `weather_info` (current conditions) and `weather_forecast_info`
# (3-day forecast) in place.
#   weather_info          -- a `weather` instance, mutated in place
#   weather_forecast_info -- list of three `weather_forecast` instances, mutated in place
#   url                   -- Moji weather page URL for one location
# Returns None; on network/HTTP failure it prints a message and leaves the
# objects untouched.
def weather_spider(weather_info, weather_forecast_info, url):
    if (DEBUG_MODE):
        # Parse a saved local copy of the page instead of hitting the network
        with open("yanta.html", 'r', encoding='utf-8') as html_file:
            html_text = html_file.read()
        soup = BeautifulSoup(html_text, "lxml")
    else:
        try:
            # FIX: timeout added so a stalled connection cannot hang the
            # crawl loop forever.
            web_data = requests.get(url, timeout=30)
        except requests.exceptions.ConnectionError:
            print("网络连接异常")
            return
        except Exception:
            print("其他异常")
            return
        if (web_data.status_code != requests.codes.ok):
            print("服务器响应异常", web_data.status_code)
            return
        soup = BeautifulSoup(web_data.text, "lxml")
    # City name
    city_tag = soup.select(".search_default > em")
    city_name = city_tag[0].get_text().replace(' ', '')
    weather_info.city = city_name
    # Publish time shown on the page (hour:minute only; the date is taken
    # to be today)
    uptime_tag = soup.find("strong", class_="info_uptime")
    uptime_str = uptime_tag.get_text()
    uptime_str = re.search(r"(\d+):(\d+)", uptime_str).groups()
    tmp_datetime = datetime.datetime.now()
    uptime = tmp_datetime.replace(hour=int(uptime_str[0]), minute=int(uptime_str[1]),
                                  second=0, microsecond=0)
    weather_info.uptime = uptime
    # Overall description from the <meta name="description"> tag
    description_tag = soup.select('meta[name="description"]')
    weather_description = description_tag[0].get("content")
    # BUGFIX: str.replace returns a new string; the original discarded the
    # result, so spaces were never removed and ASCII commas never replaced.
    weather_description = weather_description.replace(" ", "").replace(",", ",")
    weather_info.description = weather_description
    # Current conditions: <em> holds the temperature, <b> the weather text
    tmp_tag = soup.find("div", class_="wea_weather clearfix")
    weather_info.temp = float(tmp_tag.em.string)
    weather_info.weather = tmp_tag.b.string
    # Humidity (<span>) and wind (<em>) live under this div
    tmp_tag = soup.find("div", class_="wea_about clearfix")
    humi_str = tmp_tag.span.string
    humi_value = re.search(r"(\d+)", humi_str).group()
    weather_info.humi = int(humi_value)
    wind_str = tmp_tag.em.string  # the <em> tag holds the wind text
    try:
        wind_dir = re.search(r"([东西南北微无台]+风)", wind_str).group()
    except AttributeError:
        print("可能是风力等级有新的汉字,原始数据:", wind_str)
    else:
        weather_info.wind_dir = wind_dir
    # BUGFIX: guard the digit search -- wind strings with no number
    # (e.g. "微风") used to raise AttributeError and abort the whole crawl.
    wind_match = re.search(r"(\d+)", wind_str)
    if wind_match:
        weather_info.wind_min = wind_match.group()
    tmp_tag = soup.find("div", class_="wea_tips clearfix")
    weather_info.tips = tmp_tag.em.string
    # 3-day forecast: one <ul class="days clearfix"> per day
    tmp_tag = soup.find_all("ul", class_ = "days clearfix")
    day_num = 0
    for oneday in tmp_tag:
        items_tag = oneday.find_all("li")
        # City and publish time were already scraped above -- reuse them
        weather_forecast_info[day_num].uptime = weather_info.uptime
        weather_forecast_info[day_num].city = weather_info.city
        # item 0: "today/tomorrow/day after" label -- intentionally skipped
        # item 1: weather text (whitespace stripped)
        weather_forecast_info[day_num].weather = re.sub(r"\s", "", items_tag[1].get_text())
        # item 2: min / max temperature
        temps = re.findall(r"(\d+)", items_tag[2].get_text())
        weather_forecast_info[day_num].temp_min = int(temps[0])
        weather_forecast_info[day_num].temp_max = int(temps[1])
        # item 3: wind direction (<em>) and force level(s) (<b>)
        weather_forecast_info[day_num].wind_dir = items_tag[3].em.get_text()
        wind_values = re.findall(r"(\d+)", items_tag[3].b.get_text())
        if (len(wind_values) == 1):
            weather_forecast_info[day_num].wind_min = int(wind_values[0])
        elif (len(wind_values) == 2):
            weather_forecast_info[day_num].wind_min = int(wind_values[0])
            weather_forecast_info[day_num].wind_max = int(wind_values[1])
        else:
            print("解析风力级数出错")
        # item 4: AQI value followed by its level text, e.g. "85良"
        aqi_str = re.sub(r"\s", "", items_tag[4].get_text())
        weather_forecast_info[day_num].aqi = int(re.search(r"(\d+)", aqi_str).group())
        weather_forecast_info[day_num].aqi_level = re.search(r"(\D+)", aqi_str).group()
        day_num += 1
# 空气污染爬虫
# Air-quality crawler: scrapes one Moji AQI page and fills in the aqi_*
# fields of `weather_info` in place.
#   weather_info -- a `weather` instance, mutated in place
#   url          -- Moji AQI page URL for one location
# Returns None; on network/HTTP failure it prints a message and leaves the
# object untouched.  The page's own publish time is deliberately not used
# (the original author found it to lag badly behind reality).
def aqi_spider(weather_info, url):
    if (DEBUG_MODE):
        # Parse a saved local copy of the page instead of hitting the network
        with open("air.html", 'r', encoding='utf-8') as html_file:
            html_text = html_file.read()
        soup = BeautifulSoup(html_text, "lxml")
    else:
        try:
            # FIX: timeout added so a stalled connection cannot hang the
            # crawl loop forever.
            web_data = requests.get(url, timeout=30)
        except requests.exceptions.ConnectionError:
            print("网络连接异常")
            return
        except Exception:
            print("其他异常")
            return
        if (web_data.status_code != requests.codes.ok):
            print("服务器响应异常", web_data.status_code)
            return
        soup = BeautifulSoup(web_data.text, "lxml")
    # Overall AQI value and level are exposed through stable element ids
    aqi_value_tag = soup.select("#aqi_value")
    weather_info.aqi = int(aqi_value_tag[0].get_text())
    aqi_desc_tag = soup.select("#aqi_desc")
    weather_info.aqi_level = aqi_desc_tag[0].get_text()
    # Detail list: pollutant names in <em> tags, readings in <span> tags,
    # in matching order
    aqi_info_item = soup.find("ul", class_="clearfix")
    aqi_info_name_list = ["".join(name.strings)
                          for name in aqi_info_item.find_all("em")]
    # NOTE(review): int() assumes every reading is a whole number; if the
    # site ever shows a decimal (CO is often reported in mg/m3) this raises
    # ValueError -- confirm against the live page.
    aqi_info_value_list = [int(value.string)
                           for value in aqi_info_item.find_all("span")]
    # Pair names with readings so each pollutant can be looked up by name
    aqi_info_list = dict(zip(aqi_info_name_list, aqi_info_value_list))
    weather_info.aqi_PM10 = aqi_info_list["PM10"]
    weather_info.aqi_PM2P5 = aqi_info_list["PM2.5"]
    weather_info.aqi_NO2 = aqi_info_list["NO2"]
    weather_info.aqi_SO2 = aqi_info_list["SO2"]
    weather_info.aqi_O3 = aqi_info_list["O3"]
    weather_info.aqi_CO = aqi_info_list["CO"]
# 主函数
# Main entry point: prepares the output directory and files, then loops
# forever scraping weather + AQI every 45 minutes, appending one history row
# to a CSV and overwriting a "latest snapshot" text file that other programs
# can read.
def main():
    this_path = os.path.realpath(__file__)
    dir_path = os.path.dirname(this_path)
    save_path = os.path.join(dir_path, "weather_data")
    # Debug runs write to separate files so real history is not polluted
    if (DEBUG_MODE):
        save_csv_filename = os.path.join(save_path, "weather_debug.csv")
        save_txt_filename = os.path.join(save_path, "weather_debug.txt")
    else:
        save_csv_filename = os.path.join(save_path, "weather.csv")
        save_txt_filename = os.path.join(save_path, "weather.txt")
    # FIX: makedirs(exist_ok=True) replaces the original
    # exists()-then-mkdir pair, which was race-prone.
    os.makedirs(save_path, exist_ok=True)
    if not (os.path.exists(save_csv_filename)):
        # First run: write the CSV header row.  GBK encoding is kept from the
        # original (presumably so Excel on Chinese Windows opens it directly).
        with open(save_csv_filename, "w", newline = "", encoding="GBK") as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(["爬取时间", "更新时间", "地区", "天气", "温度",
                    "湿度", "风向", "最小风力", "最大风力", "小贴士", "描述",
                    "空气质量指数", "空气质量等级", "PM10", "PM2.5", "NO2", "SO2", "O3", "CO", "其他信息"])
    count = 0
    while (True):
        # Scrape current conditions, 3-day forecast and air quality
        all_weather_info = weather()
        all_weather_forecast_info = [weather_forecast(), weather_forecast(), weather_forecast()]
        weather_spider(all_weather_info, all_weather_forecast_info, wea_url)
        aqi_spider(all_weather_info, aqi_url)
        print(all_weather_info.get_weather_info_list())
        for info in all_weather_forecast_info:
            print(info.get_weather_info_list())
        # Append one row of history to the CSV
        with open(save_csv_filename, "a", newline = "", encoding="GBK") as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(all_weather_info.get_weather_info_list())
        # Overwrite the latest-snapshot text file for other programs to read
        with open(save_txt_filename, "w", encoding="GBK") as txt_file:
            txt_file.write(str(all_weather_info) + "\n")
            for item in all_weather_forecast_info:
                txt_file.write(str(item) + "\n")
        count += 1
        print("第%d次爬取完成,待机中……\n" %(count))
        time.sleep(2700)  # 45 minutes between crawls
    # (the original had an unreachable `return` after the infinite loop)
# ---------------- 程序入口 ----------------
if __name__ == "__main__":
    main()