from urllib import request
import re
import time
class Read_Msg():
    """Fetch one URL and extract every match of a regular expression from it.

    Attributes:
        url: Address handed to ``urllib.request.Request``.
        regular: Source text of the regular expression.
        head: Request headers; only a desktop-browser ``User-Agent`` is set.
        model: ``regular`` compiled with ``re.M``.
        sign: Separator that ``printmsg`` places between matches.
    """

    def __init__(self, url, regular, sign):
        """Store the target URL and separator, and compile the pattern.

        Args:
            url: URL to download.
            regular: Regular expression applied to the downloaded text.
            sign: String used to join matches when printing.

        Raises:
            re.error: If ``regular`` is not a valid pattern.
        """
        self.url = url
        self.regular = regular
        self.head = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
        }
        self.model = re.compile(self.regular, re.M)
        self.sign = sign

    def gethtml(self):
        """Download ``self.url`` and return all pattern matches in its text.

        The body is decoded with the charset declared in the response's
        Content-Type header, falling back to UTF-8 when none is declared or
        the declared one is unknown to Python. Undecodable bytes are replaced
        rather than aborting the whole page.

        Returns:
            The result of ``self.model.findall``: a list of strings, or a
            list of tuples when the pattern has more than one group.
        """
        req = request.Request(self.url, headers=self.head)
        # The context manager closes the response even if read() fails.
        with request.urlopen(req) as resp:
            data = resp.read()
            charset = resp.headers.get_content_charset() or 'utf-8'
        try:
            html = data.decode(charset, errors='replace')
        except LookupError:
            # The server declared a codec name Python does not recognise.
            html = data.decode('utf-8', errors='replace')
        return self.model.findall(html)

    def printmsg(self):
        """Print all matches joined by ``self.sign``, framed by blank lines.

        When the pattern has several groups, ``findall`` yields tuples; their
        non-empty groups are flattened into the output so that ``str.join``
        does not raise ``TypeError``.
        """
        pieces = []
        for match in self.gethtml():
            if isinstance(match, tuple):
                # Groups that did not participate in the match are ''.
                pieces.extend(group for group in match if group)
            else:
                pieces.append(match)
        print("\n%s\n" % self.sign.join(pieces))
def _read_delay():
    """Prompt until a non-negative whole number of seconds (blank = 0) is given."""
    while True:
        raw = input("输入提取时间间隔/秒(反爬),不输入直接回车")
        if raw == '':
            return 0
        try:
            delay = int(raw)
        except ValueError:
            print("时间间隔必须为数字")
            continue
        # time.sleep() rejects negative values, so refuse them here instead of
        # failing after every downloaded page.
        if delay >= 0:
            return delay
        print("时间间隔必须为数字")


def _read_int(prompt, minimum=None):
    """Prompt with *prompt* until the reply is an int (>= *minimum* if given)."""
    while True:
        try:
            value = int(input(prompt))
        except ValueError:
            print("请输入整数")
            continue
        if minimum is None or value >= minimum:
            return value
        print("请输入不小于 %d 的整数" % minimum)


def main():
    """Run the interactive extractor.

    Asks once for the target URL, the optional URL suffix, the separator and
    the delay between requests. It then loops forever: ask for a regular
    expression and a page range, fetch every page through ``Read_Msg`` and
    print its matches, then offer to switch to a new URL. The loop has no
    exit prompt; interrupt the program (Ctrl-C) to leave it.
    """
    print("欢迎使用网页信息提取器!本提取器可用于贴吧,论坛等信息的快捷爬取及正则的测试\n")
    url = input("输入目标网址(若提取多页需带删除页数后缀,单页提取直接复制完整url):\n如 http://tieba.baidu.com/p/5903613724?pn=\n")
    url_end = input("输入网址页码的后缀(*只有当多页提取,页数在网址的中间部分改变时才需输入!)")
    sign = input("输入信息分割(如;_./空格等符号):")
    times = _read_delay()
    while True:
        # Backslashes are doubled so the help text shows \w, \. and \s
        # literally without relying on invalid escape sequences.
        regular = input("输入正则表达式:常见正则如:\n"
                        "提取QQ邮箱: [1-9][0-9]{5,11}@qq.com\n"
                        "提取163邮箱: [1-9a-zA-Z][0-9a-zA-Z]{3,17}@163.com\n"
                        "提取爱奇艺电影列表: title=\"(\\w{1,20})\" rseat=\" 或 target=\"_blank\">(\\w{1,20})</a> \n"
                        "提取图片url: src=\"(https{0,1}.*?\\.[jp][pn][g])\"[\\s|>][daswpctz]{0,1}\n")
        page_start = _read_int("输入起始翻页后缀:\n(如http://tieba.baidu.com/p/5579186130?pn=5 作起始页,则输入5)\n")
        page_end = _read_int("输入结束翻页后缀:\n如http://tieba.baidu.com/p/5579186130?pn=120 作终止页,则输入120,单页输1\n")
        # A step below 1 would make range() raise (0) or yield nothing (< 0).
        page = _read_int("输入每页之间的间隔数(如每页之间间隔20为新的一页就输20,单页提取或每页间隔1则输入1:\n", minimum=1)

        if page_end > 1:
            # Multi-page: only the number between the URL prefix and suffix
            # changes. The last page is page_end itself, as the prompt states,
            # not page_start * page_end.
            targets = [url + str(num) + url_end
                       for num in range(page_start, page_end + 1, page)]
        else:
            # Single page: the URL is used exactly as entered.
            targets = [url]

        for url_num in targets:
            try:
                Read_Msg(url_num, regular, sign).printmsg()
            except Exception as exc:
                # Keep crawling the remaining pages, but show what went wrong.
                # KeyboardInterrupt is deliberately not caught here.
                print("提取进程发生异常!", exc)
            else:
                # Throttle only after a successful fetch.
                time.sleep(times)

        print("\n已完成所有信息提取!\n当前目标url: %s\n当前使用正则:%s" % (url, regular))
        url_change = input("更换目标url请输入NEW,按其他键使用当前url并继续\n")
        if url_change == 'NEW':
            url = input("输入新目标url:")
            url_end = input("输入新的网址翻页后缀(多页提取时页数在网址的中间部分改变时才需输入!)")
# Start the interactive extractor only when this file is executed as a script,
# not when it is imported as a module.
if "__main__" == __name__:
    main()