python 公众号爬虫_「Python」 - 微信公众号爬虫

钱劲

2023-12-01

#!/usr/bin/env python3

# -*- coding: utf-8 -*-

import time

import datetime

import random

import json

import base64

import re

import requests

import urllib3

import logging.handlers

from flask import Flask, request

from flask import jsonify

from html.parser import HTMLParser

from peewee import * # pylint: disable=unused-import

# 禁用requests提交https请求时的报错

urllib3.disable_warnings()

# LOG相关 START#############################################################

# Log to a per-day file named info.<YYYY-MM-DD>.log. The date is taken once,
# when this module is loaded (strftime defaults to the current local time).
rq = time.strftime('%Y-%m-%d')
LOG_FILE = 'info.{}.log'.format(rq)

# Record layout: [time] [function:file:line]LEVEL-message
fmt = '[%(asctime)s] [%(funcName)s:%(filename)s:%(lineno)d]%(levelname)s-%(message)s'
formatter = logging.Formatter(fmt)

# Rotate once the file reaches 10 MiB and keep at most five old files.
handler = logging.handlers.RotatingFileHandler(
    LOG_FILE,
    maxBytes=10 * 1024 * 1024,
    backupCount=5,
    encoding='utf-8',
)
handler.setFormatter(formatter)

# The 'info' logger is the one used throughout this file.
logger = logging.getLogger('info')
logger.setLevel(logging.DEBUG)
logger.addHandler(handler)

# LOG相关 END#############################################################

# DB相关 START#############################################################

# Connection settings for the MySQL database used by the models below.
# NOTE(review): host and credentials are hard-coded here — consider moving
# them to configuration before deploying.
_db_options = {
    "host": "192.168.47.249",
    "port": 3306,
    "user": "root",
    "passwd": "root",
}
db = MySQLDatabase("wechat_db", **_db_options)

# The connection is opened as soon as this module is loaded.
db.connect()

class BaseModel(Model):
    """Shared base class that binds every model in this file to ``db``."""

    class Meta:
        # All subclasses read and write through the module-level connection.
        database = db

class WechatAuthor(BaseModel):
    """A WeChat official account, stored in the ``wx_author`` table."""

    id = PrimaryKeyField()

    # Account id; wechat_content() stores the nick name here when the id
    # found on the page is empty.
    wx_id = CharField(null=False)

    # The account's ``biz`` value. Unique: wechat_content() looks rows up by
    # it to decide whether the account is already stored.
    biz = CharField(null=False, unique=True)

    nick_name = CharField(null=True)

    # wechat_history() only fetches the article history while this is 0.
    status = IntegerField(null=True, default=0)

    # Derived from the base64-decoded ``biz`` by wechat_content().
    biz_id = BigIntegerField(null=True)

    create_time = DateTimeField(null=True)
    last_modified_date = DateTimeField(null=True)

    class Meta:
        db_table = 'wx_author'

class WechatContent(BaseModel):
    """One article pushed by an official account (``wx_content`` table)."""

    id = PrimaryKeyField()

    # ``biz`` of the publishing account; indexed but not unique, since one
    # account has many articles.
    biz = CharField(null=False, index=True)

    # Article metadata, copied by get_wx_content() from the history response.
    author = CharField(null=True)
    title = CharField(null=False)
    digest = CharField(null=True)
    content_url = CharField(null=False)
    source_url = CharField(null=True)
    push_time = DateTimeField(null=True)

    # Counters; get_wx_content() writes 0 for all three when inserting.
    read_num = IntegerField(null=True)
    like_num = IntegerField(null=True)
    reward_num = IntegerField(null=True)

    create_time = DateTimeField(null=True)
    last_modified_date = DateTimeField(null=True)

    class Meta:
        db_table = 'wx_content'

# DB相关 END#############################################################

# HTTP相关 START#############################################################

# Attribute lists of the <a> tags matched by MyHTMLParser. Kept at module
# level because get_wxurl_list() reads it and wechat_content() resets it.
attrs_list = list()


class MyHTMLParser(HTMLParser):
    """Collect the attributes of the official-account links on a result page.

    A link qualifies when it is an ``<a>`` tag carrying a ``uigs`` attribute
    whose value contains ``account_name``; the ``href`` of such a tag is the
    account URL. One page may contain several of them, and the full attribute
    list of each is appended to the module-level ``attrs_list``.
    """

    def handle_starttag(self, tag, attrs):
        """Record ``attrs`` when the start tag is an account link.

        :param tag: lower-cased tag name supplied by HTMLParser.
        :param attrs: list of ``(name, value)`` pairs for the tag.
        """
        if tag != 'a':
            return
        for name, value in attrs:
            # HTMLParser reports a value-less attribute (``<a uigs>``) as
            # None, which cannot be searched with ``in``.
            if name == 'uigs' and value is not None and 'account_name' in value:
                attrs_list.append(attrs)
                # One entry per tag is enough, even if the attribute repeats.
                break

def get_sogou_html(wechat_name, timeout=10):
    """Fetch the Sogou WeChat search result page for ``wechat_name``.

    :param wechat_name: search keyword; may be an account's wx_id or its
        nick_name.
    :param timeout: seconds to wait for Sogou before ``requests`` raises a
        timeout error instead of blocking indefinitely.
    :return: the response body as text, whatever the HTTP status was.
    """
    header = {'User-Agent': r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
    sogou_url = 'http://weixin.sogou.com/weixin'
    # Let requests build the query string so that characters such as '&',
    # '#' or spaces in the keyword are escaped instead of corrupting the URL.
    response = requests.get(
        sogou_url,
        params={'query': wechat_name},
        headers=header,
        verify=False,
        timeout=timeout,
    )
    logger.info('获取搜狗查询页面:%s-状态:%s' % (wechat_name, response.status_code))
    return response.text

def get_wechat_html(wechat_url, timeout=10):
    """Download the page of one official account.

    The account URLs are rate limited: requesting them too quickly triggers
    a captcha page, which this function does not detect.

    :param wechat_url: URL of the account page.
    :param timeout: seconds to wait for the server before ``requests``
        raises a timeout error instead of blocking indefinitely.
    :return: the response body as text, whatever the HTTP status was.
    """
    header = {'User-Agent': r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
    response = requests.get(wechat_url, headers=header, verify=False, timeout=timeout)
    logger.info('获取微信公众号页面:%s-状态:%s' % (wechat_url, response.status_code))
    # TODO: handle the captcha page
    return response.text

def get_wxurl_list(html):
    """Extract the official-account URLs from a Sogou result page.

    :param html: text of the search result page.
    :return: list with the ``href`` of every account link found in ``html``.
    """
    # Drop matches left over from an earlier page so the result only
    # reflects ``html``. Cleared in place: the parser appends to this same
    # module-level list.
    del attrs_list[:]

    parser = MyHTMLParser()
    parser.feed(html)
    parser.close()

    wxurl_list = []
    for attrs in attrs_list:
        # Look the link up by attribute name rather than by position; the
        # order of attributes inside the tag is not guaranteed.
        href = dict(attrs).get('href')
        if href:
            wxurl_list.append(href)
    return wxurl_list

def get_biz(wechat_name):
    """Look up the ``biz`` and name of every account matching ``wechat_name``.

    Searches Sogou (first result page only; pagination is not handled),
    then downloads each account page and reads the values from its inline
    JavaScript. The captured name text carries both wx_id and nick_name,
    and the wx_id part may be empty.

    :param wechat_name: search keyword passed to get_sogou_html().
    :return: list of ``(biz, name)`` tuples; pages where the values cannot
        be found are skipped.
    """
    wx_info_list = list()
    html = get_sogou_html(wechat_name)
    wxurl_list = get_wxurl_list(html)

    for wechat_url in wxurl_list:
        html = get_wechat_html(wechat_url)
        biz = re.findall(r'var biz = "(\S*)"', html)
        nick_name = re.findall(r'var name="(\S*)"', html)

        if biz and nick_name:
            wx_info_list.append((biz[0], nick_name[0]))
            logger.info('biz:%snick_name:%s' % (biz, nick_name))
        else:
            # No match, e.g. a captcha page was returned instead of the
            # account page. Skip it rather than fail on an empty list.
            logger.warning('未能从公众号页面解析biz/nick_name(可能触发验证码):%s' % (wechat_url))

        # Random pause after every page to stay below the rate limit.
        time.sleep(random.randint(5, 20))

    return wx_info_list

def headers_to_dict(headers):
    """Convert a raw block of HTTP header lines into a dict.

    Example input::

        Host: mp.weixin.qq.com
        Connection: keep-alive
        Cache-Control: max-age=0

    Each line is split at its first colon, so values that contain colons
    (URLs, for instance) are kept whole. Blank lines and lines without a
    header name or a colon are skipped.

    :param headers: str, one ``Name: value`` pair per line.
    :return: dict mapping header names to values, both stripped of
        surrounding whitespace.
    """
    d_headers = dict()
    for line in headers.split("\n"):
        name, sep, value = line.partition(":")
        name = name.strip()
        # ``sep`` is empty when the line has no colon (blank line, request
        # line, stray text): nothing to store.
        if not sep or not name:
            continue
        d_headers[name] = value.strip()
    return d_headers

def _save_wx_article(biz, info, push_time):
    """Insert one article described by ``info`` into ``wx_content``."""
    now_time = datetime.datetime.now()
    WechatContent.create(
        biz=biz,
        author=info['author'],
        title=info['title'],
        digest=info['digest'],
        content_url=info['content_url'],
        source_url=info['source_url'],
        push_time=push_time,
        read_num=0,
        like_num=0,
        reward_num=0,
        create_time=now_time,
        last_modified_date=now_time,
    )
    logger.info('%s-%s入库' % (info['title'], info['author']))


def get_wx_content(biz, pass_ticket='', appmsg_token='', cookie=''):
    """Fetch the whole article history of one account and store it.

    Pages through the ``profile_ext`` endpoint ten messages at a time until
    the response reports that nothing more follows, inserting one
    ``WechatContent`` row per article. A push that bundles several articles
    yields one row for the main article plus one per bundled item, all
    sharing the push time.

    The session values change frequently and have to be captured from live
    WeChat app traffic.

    :param biz: ``biz`` value of the account.
    :param pass_ticket: session value for the URL, from captured traffic.
    :param appmsg_token: session value for the URL, from captured traffic.
    :param cookie: ``Cookie`` header value, from captured traffic.
    :raises KeyError: when the response lacks the expected fields.
    """
    url = ('https://mp.weixin.qq.com/mp/profile_ext?x5=0&is_ok=1&action=getmsg&scene=126&uin=777&key=777&f=json&count=10&'
           '__biz={}&offset={}&'
           'pass_ticket={}&'
           'appmsg_token={}')
    headers = {
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_3 like Mac OS X) AppleWebKit/603.3.8 (KHTML, like Gecko) Mobile/14G60 MicroMessenger/6.7.0 NetType/WIFI Language/zh_CN',
        'Cookie': cookie}

    next_offset = 0
    while True:
        # All four placeholders must be filled, otherwise format() raises
        # IndexError before any request is made.
        page_url = url.format(biz, next_offset, pass_ticket, appmsg_token)
        response = requests.get(page_url, headers=headers, verify=False, timeout=10)
        logger.info('content_url:%s- 状态:%s' % (page_url, response.status_code))

        rs = json.loads(response.text)
        # Flags whether another page follows, and where it starts.
        can_msg_continue = rs['can_msg_continue']
        next_offset = rs['next_offset']
        # The message list is itself a JSON document embedded as a string.
        msg_list = json.loads(rs['general_msg_list'])

        for obj in msg_list['list']:
            ext_info = obj.get('app_msg_ext_info')
            if not ext_info:
                # Entry carries no article information: nothing to store.
                continue
            # Hand peewee real datetime objects instead of pre-formatted
            # strings.
            push_time = datetime.datetime.fromtimestamp(int(obj['comm_msg_info']['datetime']))
            _save_wx_article(biz, ext_info, push_time)

            if ext_info['is_multi'] == 1:
                for sub_obj in ext_info['multi_app_msg_item_list']:
                    _save_wx_article(biz, sub_obj, push_time)

        if can_msg_continue == 0:
            logger.info('%s历史文章获取入库完毕' % (biz))
            break

        # Random pause between pages to avoid being rate limited.
        time.sleep(random.randint(5, 20))

# HTTP相关 END#############################################################

# WEB相关 START############################################################

# Status payloads embedded in the JSON responses of the web endpoints.
R200_OK = dict(code=200, message='OK')
R500_ERROR = dict(code=500, message='ERROR')

def statusResponse(statu_dic):
    """Build a JSON response that carries only a status payload.

    :param statu_dic: status dict such as ``R200_OK`` or ``R500_ERROR``.
    :return: the response produced by ``jsonify`` for ``{'status': ...}``.
    """
    return jsonify(status=statu_dic)

def fullResponse(statu_dic, data):
    """Build a JSON response that carries a status payload plus data.

    :param statu_dic: status dict such as ``R200_OK`` or ``R500_ERROR``.
    :param data: JSON-serialisable payload placed under the ``data`` key.
    :return: the response produced by ``jsonify`` for both keys.
    """
    return jsonify(status=statu_dic, data=data)

# The Flask application that the route decorators below register with.
app = Flask(import_name=__name__)

# REST style: the account name to search for is taken from the URL path.
@app.route('/wechat/<wx_name>', methods=['PUT', 'GET'])
def wechat_content(wx_name):
    """Search for ``wx_name`` and store every matching account.

    Steps:
      0. (not implemented) validate ``wx_name`` against Sogou's rules;
      1. look up the matching accounts with get_biz();
      2. skip accounts whose ``biz`` is already in ``wx_author``;
      3. insert the remaining ones.

    :param wx_name: search keyword from the URL (wx_id or nick name).
    :return: JSON response with ``R200_OK``; insert failures are logged
        and do not change the response.
    """
    # The parser results are collected in a module-level list, which has to
    # be emptied before a new search.
    global attrs_list
    attrs_list = list()

    logger.info('获取wechat_name=%s相关信息开始' % wx_name)
    wx_info_list = get_biz(wx_name)
    logger.info('待入库微信公众号信息-%s' % wx_info_list)

    for biz, raw_name in wx_info_list:
        # raw_name is the text captured by get_biz(); it holds the wx_id and
        # the nick name joined by the literal separator "||" with quotes.
        parts = raw_name.split('\"||\"')
        wx_id = parts[0]
        # Without a separator there is only one name; use it for both.
        nick_name = parts[1] if len(parts) > 1 else parts[0]

        # biz_id is a BigIntegerField, so convert the decoded bytes to an
        # int; leave it empty when biz does not decode to a number.
        try:
            biz_id = int(base64.b64decode(biz))
        except ValueError:
            biz_id = None

        if wx_id == '':
            wx_id = nick_name
            logger.info('wx_id匹配为null，使用nick_name:%s替代' % (nick_name))

        # 2. Is this account already stored?
        try:
            WechatAuthor.get(biz=biz)
        except WechatAuthor.DoesNotExist:
            pass
        except Exception as err:
            logger.error('数据插入错误%s' % (err), exc_info=True)
            continue
        else:
            logger.info('该biz:%s已存在' % biz)
            continue

        # 3. Store the new account.
        now_time = datetime.datetime.now()
        try:
            rs = WechatAuthor.create(wx_id=wx_id, biz=biz, nick_name=nick_name, create_time=now_time, last_modified_date=now_time, biz_id=biz_id)
            logger.info('%s插入自增id:%s' % (nick_name, rs))
        except Exception as err:
            logger.error('数据插入错误%s' % (err), exc_info=True)

    return statusResponse(R200_OK)

@app.route('/wechat/history/<nick_name>', methods=['PUT', 'GET'])
def wechat_history(nick_name):
    """Fetch and store the article history of a known account.

    The account must already exist in ``wx_author``. Its history is fetched
    only while ``status`` is 0; after a successful fetch the status is set
    to 1 so that later calls do not insert the same articles again.

    :param nick_name: nick name of the account, taken from the URL path.
    :return: JSON response with ``R200_OK`` on success or when the history
        was fetched before, ``R500_ERROR`` when the account is unknown or
        the fetch fails.
    """
    logger.info("尝试获取%s该公众号历史文章" % (nick_name))

    try:
        wechat_author = WechatAuthor.get(nick_name=nick_name)
    except WechatAuthor.DoesNotExist:
        logger.info("并未获取%s该公众号基本信息" % (nick_name))
        return statusResponse(R500_ERROR)
    except Exception as err:
        logger.error("%s" % (err), exc_info=True)
        return statusResponse(R500_ERROR)

    if wechat_author.status != 0:
        logger.info("已获取%s该公众号历史文章" % (nick_name))
        return statusResponse(R200_OK)

    try:
        get_wx_content(wechat_author.biz)
        # Mark the account as done; without this the check above never
        # takes effect and every call re-inserts the full history.
        wechat_author.status = 1
        wechat_author.last_modified_date = datetime.datetime.now()
        wechat_author.save()
    except Exception as err:
        logger.error("%s" % (err), exc_info=True)
        return statusResponse(R500_ERROR)

    return statusResponse(R200_OK)

# WEB相关 END############################################################

if __name__ == '__main__':
    import os

    # The server listens on every interface, so the interactive debugger
    # must not be on by default: it allows executing arbitrary Python code
    # from the browser. Set FLASK_DEBUG=1 to enable it for local work.
    debug = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true')
    app.run(host='0.0.0.0', port=8081, debug=debug)

python 公众号爬虫_「Python」 - 微信公众号爬虫

相关阅读

相关文章

相关问答

相关文档