Python 正则表达式的简单测试

单于山
2023-12-01

用于简单地测试正则表达式的正确性,顺便爬取一些信息

from urllib import request
import re
import time


class Read_Msg():
    """Download one page and extract every match of a regular expression.

    Attributes:
        url: Address of the page to download.
        regular: Source text of the regular expression.
        head: Request headers; only a desktop-browser User-Agent is set.
        model: ``regular`` compiled with ``re.M``.
        sign: Separator placed between matches when they are printed.
    """

    def __init__(self, url, regular, sign):
        """Store the target, compile the pattern and prepare the headers.

        Args:
            url: Page address handed to ``urllib.request``.
            regular: Regular expression source; compiled with ``re.M``.
            sign: String used to join the matches in ``printmsg``.

        Raises:
            re.error: If ``regular`` is not a valid pattern.
        """
        self.url = url
        self.regular = regular
        self.head = {}
        self.head[
            'User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
        self.model = re.compile(self.regular, re.M)
        self.sign = sign

    def gethtml(self):
        """Fetch ``self.url`` and return all matches of the compiled pattern.

        The body is decoded with the charset announced in the response's
        Content-Type header, falling back to UTF-8 when none is announced
        or the announced codec is unknown to Python.

        Returns:
            The list produced by ``findall``: strings when the pattern has
            zero or one group, tuples of strings when it has several.

        Raises:
            urllib.error.URLError: If the page cannot be retrieved.
            UnicodeDecodeError: If the body is not valid in the chosen codec.
        """
        req = request.Request(self.url, headers=self.head)
        # The context manager closes the connection even if read() fails.
        with request.urlopen(req) as resp:
            data = resp.read()
            charset = resp.headers.get_content_charset()
        try:
            html = data.decode(charset or 'utf-8')
        except LookupError:
            # Unknown codec name in the header: use the historical default.
            html = data.decode('utf-8')
        return self.model.findall(html)

    def printmsg(self):
        """Print all matches joined by ``self.sign``, framed by blank lines.

        Tuple matches (patterns with several groups) are shown in their
        tuple form so that ``str.join`` does not fail on them.
        """
        matches = self.gethtml()
        print("\n%s\n" % self.sign.join(
            m if isinstance(m, str) else str(m) for m in matches))

def _read_int(prompt):
    """Prompt repeatedly until the user enters a valid integer; return it."""
    while True:
        try:
            return int(input(prompt))
        except ValueError:
            print("请输入整数")


def main():
    """Interactively collect scrape settings and run extraction rounds.

    Asks once for the target URL, an optional URL tail that follows the
    page number, the separator used when printing matches, and the delay
    between pages. It then loops forever: each round asks for a regular
    expression and a page range, fetches every page through ``Read_Msg``,
    prints the matches, and offers to switch to a new URL. The loop has no
    exit option; the program runs until it is interrupted.
    """
    print("欢迎使用网页信息提取器!本提取器可用于贴吧,论坛等信息的快捷爬取及正则的测试\n")
    url = input("输入目标网址(若提取多页需带删除页数后缀,单页提取直接复制完整url):\n如 http://tieba.baidu.com/p/5903613724?pn=\n")
    url_end = input("输入网址页码的后缀(*只有当多页提取,页数在网址的中间部分改变时才需输入!)")
    sign = input("输入信息分割(如;_./空格等符号):")
    while True:
        raw_delay = input("输入提取时间间隔/秒(反爬),不输入直接回车")
        try:
            # An empty answer means "no delay".
            times = int(raw_delay) if raw_delay != '' else 0
            if times < 0:
                # time.sleep() rejects negative values, so refuse them here.
                raise ValueError("negative delay")
            break
        except ValueError:
            print("时间间隔必须为数字")

    while True:
        # Backslashes are doubled so the hint prints \w, \. and \s without
        # relying on invalid string escape sequences.
        regular = input("输入正则表达式:常见正则如:\n"
                        "提取QQ邮箱: [1-9][0-9]{5,11}@qq.com\n"
                        "提取163邮箱: [1-9a-zA-Z][0-9a-zA-Z]{3,17}@163.com\n"
                        "提取爱奇艺电影列表: title=\"(\\w{1,20})\" rseat=\"  或 target=\"_blank\">(\\w{1,20})</a> \n"
                        "提取图片url: src=\"(https{0,1}.*?\\.[jp][pn][g])\"[\\s|>][daswpctz]{0,1}\n")  # regular expression
        try:
            # Reject a broken pattern once, instead of failing on every page.
            re.compile(regular, re.M)
        except re.error as exc:
            print("正则表达式无效: %s" % exc)
            continue
        page_start = _read_int("输入起始翻页后缀:\n(如http://tieba.baidu.com/p/5579186130?pn=5 作起始页,则输入5)\n")
        page_end = _read_int("输入结束翻页后缀:\n如http://tieba.baidu.com/p/5579186130?pn=120 作终止页,则输入120,单页输1\n")
        page = _read_int("输入每页之间的间隔数(如每页之间间隔20为新的一页就输20,单页提取或每页间隔1则输入1:\n")
        if page < 1:
            # range() raises on a zero step and yields nothing for a
            # negative one; treat both as the default step of 1.
            page = 1
        if page_end > 1:
            # Multi-page mode: the page number (plus optional tail) is
            # appended to the URL, from page_start through page_end inclusive.
            targets = [url + str(num) + url_end
                       for num in range(page_start, page_end + 1, page)]
        else:
            # Single-page mode: fetch the URL exactly as entered, once.
            targets = [url]
        for url_num in targets:
            try:
                P = Read_Msg(url_num, regular, sign)
                P.printmsg()
            except Exception as exc:
                # Keep going with the next page, but say what went wrong;
                # Ctrl-C is no longer swallowed here.
                print("提取进程发生异常! (%s: %s)" % (type(exc).__name__, exc))
            else:
                time.sleep(times)
        print("\n已完成所有信息提取!\n当前目标url: %s\n当前使用正则:%s" % (url, regular))
        url_change = input("更换目标url请输入NEW,按其他键使用当前url并继续\n")
        if url_change == 'NEW':
            url = input("输入新目标url:")
            url_end = input("输入新的网址翻页后缀(多页提取时页数在网址的中间部分改变时才需输入!)")


# Start the interactive extractor only when this file is executed as a
# script; importing it as a module does not call main().
if __name__ == '__main__':
    main()

 

 类似资料: