#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import base64
import datetime
import json
import logging.handlers
import os
import random
import re
import time
from html.parser import HTMLParser

import requests
import urllib3
from flask import Flask, request
from flask import jsonify
from peewee import * # pylint: disable=unused-import
# Silence the warnings urllib3 emits for HTTPS requests; the requests.get()
# calls in this file are made with verify=False.
urllib3.disable_warnings()
# Logging setup START ######################################################
# The log file name carries the local date on which the module was loaded.
rq = time.strftime('%Y-%m-%d')
LOG_FILE = 'info.%s.log' % (rq)

# Record layout: [time] [function:file:line]LEVEL-message
fmt = '[%(asctime)s] [%(funcName)s:%(filename)s:%(lineno)d]%(levelname)s-%(message)s'
formatter = logging.Formatter(fmt)

# Size-based rotation: 10 MiB per file, five backups kept.
handler = logging.handlers.RotatingFileHandler(
    LOG_FILE,
    maxBytes=10 * 1024 * 1024,
    backupCount=5,
    encoding='utf-8',
)
handler.setFormatter(formatter)

# Named logger used by every function in this file.
logger = logging.getLogger('info')
logger.setLevel(logging.DEBUG)
logger.addHandler(handler)
# Logging setup END ########################################################
# Database setup START #####################################################
# Connection settings keep the original development values as defaults but
# can be overridden through environment variables, so real credentials do not
# have to be committed to the source file.
db = MySQLDatabase(
    os.environ.get('WECHAT_DB_NAME', 'wechat_db'),
    host=os.environ.get('WECHAT_DB_HOST', '192.168.47.249'),
    port=int(os.environ.get('WECHAT_DB_PORT', '3306')),
    user=os.environ.get('WECHAT_DB_USER', 'root'),
    passwd=os.environ.get('WECHAT_DB_PASSWORD', 'root'),
)
# The connection is opened at import time, as before.
db.connect()
class BaseModel(Model):
    """Common base class of WechatAuthor and WechatContent.

    Its ``Meta`` points at the module-level ``db`` database object.
    """

    class Meta:
        database = db
class WechatAuthor(BaseModel):
    """A WeChat public account, stored in table ``wx_author``."""

    id = PrimaryKeyField()
    # Account id; wechat_content() stores nick_name here when the scraped id is empty.
    wx_id = CharField(null=False)
    # Value scraped from the account page (``var biz = "..."``); has a unique constraint.
    biz = CharField(null=False, unique=True)
    nick_name = CharField(null=True)
    # 0 (the default) makes wechat_history() fetch the article history; any
    # other value is treated there as "history already fetched".
    status = IntegerField(null=True, default=0)
    # Filled by wechat_content() from the base64-decoded ``biz``.
    biz_id = BigIntegerField(null=True)
    create_time = DateTimeField(null=True)
    last_modified_date = DateTimeField(null=True)

    class Meta:
        db_table = 'wx_author'
class WechatContent(BaseModel):
    """One article of a public account, stored in table ``wx_content``.

    Rows are created by get_wx_content().
    """

    id = PrimaryKeyField()
    # ``biz`` of the account the article belongs to (indexed; not declared as a foreign key).
    biz = CharField(null=False, index=True)
    author = CharField(null=True)
    title = CharField(null=False)
    digest = CharField(null=True)
    content_url = CharField(null=False)
    source_url = CharField(null=True)
    # Derived in get_wx_content() from the message's ``comm_msg_info.datetime`` timestamp.
    push_time = DateTimeField(null=True)
    # The three counters below are always written as 0 by get_wx_content().
    read_num = IntegerField(null=True)
    like_num = IntegerField(null=True)
    reward_num = IntegerField(null=True)
    create_time = DateTimeField(null=True)
    last_modified_date= DateTimeField(null=True)

    class Meta:
        db_table = 'wx_content'
# Database setup END #######################################################
# HTTP helpers START #######################################################
# Attribute lists of the matching <a> tags collected by MyHTMLParser.
# NOTE: this is module-level shared state; wechat_content() resets it before
# each search.
attrs_list = list()


class MyHTMLParser(HTMLParser):
    """Collect the attributes of ``<a>`` tags that link to a public account.

    A tag qualifies when it carries a ``uigs`` attribute whose value contains
    ``account_name``; the tag's ``href`` is the public-account URL. One page
    may contain several such tags.
    """

    def handle_starttag(self, tag, attrs):
        """Append ``attrs`` to the global ``attrs_list`` for a qualifying tag.

        :param tag: lower-cased tag name supplied by HTMLParser
        :param attrs: list of ``(name, value)`` pairs of the tag
        """
        if tag != 'a':
            return
        for name, value in attrs:
            # HTMLParser reports a valueless attribute (``<a uigs>``) with the
            # value None, which must not reach the ``in`` test.
            if name == 'uigs' and value and 'account_name' in value:
                attrs_list.append(attrs)
                # Record each tag once even if the attribute is repeated.
                break
# wechat_name may be either a wx_id or a nick_name.
def get_sogou_html(wechat_name, timeout=15):
    """Fetch the Sogou WeChat search page for ``wechat_name``.

    :param wechat_name: search term, a wx_id or a nick_name
    :param timeout: seconds to wait for the server before giving up
    :return: body of the response as text
    :raises requests.RequestException: on connection failure or timeout
    """
    header = {'User-Agent': r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
    sogou_url = 'http://weixin.sogou.com/weixin'
    # Let requests build the query string so that spaces, '&', '#' and
    # non-ASCII characters in the name are URL-encoded correctly.
    response = requests.get(
        sogou_url,
        params={'query': wechat_name},
        headers=header,
        verify=False,
        timeout=timeout,
    )
    logger.info('获取搜狗查询页面:%s-状态:%s', wechat_name, response.status_code)
    return response.text
# Public-account URLs are rate limited; requesting them too quickly triggers
# a captcha page.
def get_wechat_html(wechat_url, timeout=15):
    """Fetch the page of a WeChat public account.

    :param wechat_url: public-account URL taken from the Sogou result page
    :param timeout: seconds to wait for the server before giving up
    :return: body of the response as text (may be a captcha page)
    :raises requests.RequestException: on connection failure or timeout
    """
    header = {'User-Agent': r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
    # A timeout keeps a stalled connection from blocking the caller forever.
    response = requests.get(wechat_url, headers=header, verify=False, timeout=timeout)
    logger.info('获取微信公众号页面:%s-状态:%s', wechat_url, response.status_code)
    # TODO: detect and handle the captcha page.
    return response.text
def get_wxurl_list(html):
    """Extract the public-account URLs from a Sogou search result page.

    :param html: text of the Sogou result page
    :return: list of ``href`` values of the account links found in ``html``
    """
    # MyHTMLParser collects into the module-level attrs_list; empty it in
    # place so the result only reflects this page, not earlier calls.
    del attrs_list[:]
    parser = MyHTMLParser()
    parser.feed(html)
    wxurl_list = []
    for attrs in attrs_list:
        # Look the link up by name instead of assuming it is the third attribute.
        href = dict(attrs).get('href')
        if href:
            wxurl_list.append(href)
    return wxurl_list
def get_biz(wechat_name):
    """Look up the accounts matching ``wechat_name`` and scrape their ids.

    :param wechat_name: search term, a wx_id or a nick_name
    :return: list of ``(biz, name)`` tuples, one per account page on which both
        values were found; ``name`` is the raw ``var name=`` value, which
        wechat_content() splits into wx_id and nick_name
    """
    wx_info_list = list()
    # Sogou result pagination is not handled: only the first page is used.
    html = get_sogou_html(wechat_name)
    for wechat_url in get_wxurl_list(html):
        html = get_wechat_html(wechat_url)
        # The page's JavaScript holds biz as well as wx_id and nick_name
        # (wx_id may be ''). A captcha page contains neither, so the
        # patterns may not match at all.
        biz = re.search(r'var biz = "(\S*)"', html)
        nick_name = re.search(r'var name="(\S*)"', html)
        if biz and nick_name:
            wx_info_list.append((biz.group(1), nick_name.group(1)))
            logger.info('biz:%snick_name:%s', biz.group(1), nick_name.group(1))
        else:
            logger.warning('biz/name not found in %s (captcha page?), skipped', wechat_url)
        # Random pause to stay below the access-frequency limit.
        time.sleep(random.randint(5, 20))
    return wx_info_list
def headers_to_dict(headers):
    """Convert a raw, newline-separated header string into a dict.

    Example input::

        Host: mp.weixin.qq.com
        Connection: keep-alive
        Cache-Control: max-age=

    Blank lines and lines without a ``name: value`` shape (for instance a
    pasted request line such as ``GET / HTTP/1.1``) are ignored.

    :param headers: str
    :return: dict mapping header names to values, both stripped of
        surrounding whitespace
    """
    d_headers = dict()
    for line in headers.split("\n"):
        # Split on the first colon only: values such as host:port contain more.
        name, separator, value = line.strip().partition(":")
        name = name.strip()
        if not separator or not name:
            continue
        d_headers[name] = value.strip()
    return d_headers
def _save_wx_article(biz, info, push_time, now_time):
    """Insert the article described by the dict ``info`` into wx_content."""
    WechatContent.create(
        biz=biz,
        author=info['author'],
        title=info['title'],
        digest=info['digest'],
        content_url=info['content_url'],
        source_url=info['source_url'],
        push_time=push_time,
        read_num=0,
        like_num=0,
        reward_num=0,
        create_time=now_time,
        last_modified_date=now_time,
    )
    logger.info('%s-%s入库' % (info['title'], info['author']))


def get_wx_content(biz, pass_ticket='', appmsg_token='', cookie='', timeout=15):
    """Page through an account's message history and store every article.

    ``pass_ticket``, ``appmsg_token`` and ``cookie`` have to be captured from
    live traffic of the WeChat app; they change frequently.

    :param biz: ``biz`` value of the public account
    :param pass_ticket: value for the ``pass_ticket`` query parameter
    :param appmsg_token: value for the ``appmsg_token`` query parameter
    :param cookie: value for the ``Cookie`` request header
    :param timeout: seconds to wait for each HTTP response
    :raises ValueError: when a response lacks the expected paging keys
    :raises requests.RequestException: on connection failure or timeout
    """
    url = 'https://mp.weixin.qq.com/mp/profile_ext?x5=0&is_ok=1&action=getmsg&scene=126&uin=777&key=777&f=json&count=10&' \
          '__biz={}&offset={}&' \
          'pass_ticket={}&' \
          'appmsg_token={}'
    headers = {
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_3 like Mac OS X) AppleWebKit/603.3.8 (KHTML, like Gecko) Mobile/14G60 MicroMessenger/6.7.0 NetType/WIFI Language/zh_CN',
        'Cookie': cookie}
    next_offset = 0
    while True:
        # All four placeholders must be filled; formatting with fewer
        # arguments raises IndexError.
        page_url = url.format(biz, next_offset, pass_ticket, appmsg_token)
        response = requests.get(page_url, headers=headers, verify=False, timeout=timeout)
        logger.info('content_url:%s- 状态:%s', page_url, response.status_code)
        rs = json.loads(response.text)
        missing = [key for key in ('can_msg_continue', 'next_offset', 'general_msg_list') if key not in rs]
        if missing:
            raise ValueError('profile_ext response for biz %s lacks %s: %s'
                             % (biz, missing, response.text[:200]))
        # Flags whether another page follows.
        can_msg_continue = rs['can_msg_continue']
        next_offset = rs['next_offset']
        # general_msg_list is itself a JSON document embedded as a string.
        msg_list = json.loads(rs['general_msg_list'])
        for obj in msg_list['list']:
            ext_info = obj.get('app_msg_ext_info')
            if not ext_info:
                # Not an article message: nothing to store.
                continue
            # datetime objects are handed to the DateTimeField columns
            # directly instead of hand-formatted strings.
            now_time = datetime.datetime.now()
            push_time = datetime.datetime.fromtimestamp(int(obj['comm_msg_info']['datetime']))
            _save_wx_article(biz, ext_info, push_time, now_time)
            if ext_info.get('is_multi') == 1:
                # A multi-article push carries its further articles in a sub-list.
                for sub_obj in ext_info.get('multi_app_msg_item_list', []):
                    _save_wx_article(biz, sub_obj, push_time, now_time)
        if can_msg_continue == 0:
            logger.info('%s历史文章获取入库完毕' % (biz))
            break
        # Random pause between pages to avoid hammering the endpoint.
        time.sleep(random.randint(5, 20))
# HTTP helpers END #########################################################
# Web layer START ##########################################################
# Status payloads returned to API clients.
R200_OK = {
    'code': 200,
    'message': 'OK',
}
R500_ERROR = {
    'code': 500,
    'message': 'ERROR',
}
def statusResponse(statu_dic):
    """Build the JSON response ``{'status': statu_dic}``.

    :param statu_dic: status payload such as R200_OK or R500_ERROR
    :return: the value produced by ``jsonify``
    """
    payload = {'status': statu_dic}
    return jsonify(payload)
def fullResponse(statu_dic, data):
    """Build the JSON response ``{'status': statu_dic, 'data': data}``.

    :param statu_dic: status payload such as R200_OK or R500_ERROR
    :param data: value placed under the ``data`` key
    :return: the value produced by ``jsonify``
    """
    payload = {'status': statu_dic}
    payload['data'] = data
    return jsonify(payload)
# Flask application object; the @app.route decorators below register their
# view functions on it.
app = Flask(__name__)
# REST-style endpoint: wx_name is taken from the URL path. The route needs
# the <wx_name> variable, otherwise Flask calls the view without its
# required argument.
@app.route('/wechat/<wx_name>', methods=['PUT', 'GET'])
def wechat_content(wx_name):
    """Search for ``wx_name`` and store every account found in wx_author.

    :param wx_name: search term from the URL, a wx_id or a nick_name
    :return: JSON status response; R500_ERROR when the lookup itself fails,
        otherwise R200_OK (failures for individual accounts are only logged)
    """
    logger.info('获取wechat_name=%s相关信息开始' % wx_name)
    # TODO: validate wx_name against the basic requirements (see Sogou).
    # The HTML parser collects into the module-level attrs_list, so it has to
    # be emptied before each lookup.
    global attrs_list
    attrs_list = list()
    try:
        wx_info_list = get_biz(wx_name)
    except Exception:
        logger.error('lookup of wechat_name=%s failed', wx_name, exc_info=True)
        return statusResponse(R500_ERROR)
    logger.info('待入库微信公众号信息-%s' % wx_info_list)
    for biz, raw_name in wx_info_list:
        # raw_name has the shape  wx_id"||"nick_name  as scraped from the page.
        wx_id, separator, nick_name = raw_name.partition('"||"')
        if not separator:
            logger.warning('unexpected name value %r for biz %s, skipped', raw_name, biz)
            continue
        if wx_id == '':
            wx_id = nick_name
            logger.info('wx_id匹配为null,使用nick_name:%s替代' % (nick_name))
        try:
            # biz_id is a BigIntegerField, so decode and convert to int.
            biz_id = int(base64.b64decode(biz))
        except ValueError:
            # Covers invalid base64 (binascii.Error) and non-numeric content.
            biz_id = None
        now_time = datetime.datetime.now()
        try:
            # Look the account up by biz and insert it only when missing.
            author, created = WechatAuthor.get_or_create(
                biz=biz,
                defaults={
                    'wx_id': wx_id,
                    'nick_name': nick_name,
                    'create_time': now_time,
                    'last_modified_date': now_time,
                    'biz_id': biz_id,
                },
            )
        except Exception as err:
            logger.error('数据插入错误%s' % (err), exc_info=True)
            continue
        if created:
            logger.info('%s插入自增id:%s' % (nick_name, author))
        else:
            logger.info('该biz:%s已存在' % biz)
    return statusResponse(R200_OK)
# The route needs the <nick_name> variable, otherwise Flask calls the view
# without its required argument.
@app.route('/wechat/history/<nick_name>', methods=['PUT', 'GET'])
def wechat_history(nick_name):
    """Fetch and store the article history of the account ``nick_name``.

    :param nick_name: nick name of an account already stored in wx_author
    :return: JSON status response; R500_ERROR when the account is unknown or
        fetching fails, otherwise R200_OK
    """
    logger.info("尝试获取%s该公众号历史文章" % (nick_name))
    try:
        wechat_author = WechatAuthor.get(nick_name=nick_name)
    except WechatAuthor.DoesNotExist:
        logger.info("并未获取%s该公众号基本信息" % (nick_name))
        return statusResponse(R500_ERROR)
    except Exception:
        # Anything else is a database problem, not a missing account.
        logger.error("lookup of %s failed", nick_name, exc_info=True)
        return statusResponse(R500_ERROR)
    try:
        if wechat_author.status == 0:
            get_wx_content(wechat_author.biz)
            # Mark the account as fetched so that the "already fetched" branch
            # below is taken next time instead of importing duplicates.
            wechat_author.status = 1
            wechat_author.last_modified_date = datetime.datetime.now()
            wechat_author.save()
        else:
            logger.info("已获取%s该公众号历史文章" % (nick_name))
    except Exception as err:
        logger.error("%s" % (err), exc_info=True)
        return statusResponse(R500_ERROR)
    return statusResponse(R200_OK)
# Web layer END ############################################################
if __name__ == '__main__':
    # The server listens on all interfaces, and Flask's debug mode exposes an
    # interactive debugger that can execute arbitrary code. Debug mode is
    # therefore opt-in: set FLASK_DEBUG=1 for local development.
    app.run(host='0.0.0.0', port=8081, debug=os.environ.get('FLASK_DEBUG') == '1')