python3 编写Url采集器（Google，Baidu，Bing ）

阳狐若

2023-12-01

简单的实现了三个搜索引擎的Url采集。代码写的不够简洁，复用率低，大佬别笑。

考虑到人机验证就不加入多线程模块，慢慢跑也好过被拦截（菜，人机验证绕不过）。

百度的url需要进行二次访问才能抓取到原始的URL。

import requests,re
from urllib import parse
from time import sleep,time

'''URL采集器Author:Char0n'''

header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0'}

def google(keyword,pagenumber):
    time_stamp = str(time())
    try:
        for page in range(0,(pagenumber-1)*10+1,10):
            google_url = 'https://www.google.com/search?q={}&start={}'.format(keyword,page)
            html = requests.get(google_url)
            patt = re.compile('"><a href="/url\?q=(.*?)&amp;sa=')
            url_list = re.findall(patt,html.text)
            ext(url_list,time_stamp)
            sleep(5)
    except requests.exceptions.ProxyError:
        print('请求被拦截，可能需要进行人机验证')

def baidu(keyword,pagenumber):
    time_stamp = str(time())
    try:
        for page in range(0,(pagenumber-1)*10+1,10):
            baidu_url = 'https://www.baidu.com/s?wd={}&pn={}'.format(keyword,page)
            html = requests.get(baidu_url,headers=header)
            patt = re.compile('"><a target="_blank" href="(.*?)" class="')
            url_list = re.findall(patt,html.text)
            baidu_ext(url_list,time_stamp)
            sleep(5)
    except requests.exceptions.ProxyError:
        print('请求被拦截，可能需要进行人机验证')

def bing(keyword,pagenumber):
    time_stamp = str(time())
    try:
        for page in range(1,pagenumber*10+1,10):
            bing_url = 'https://cn.bing.com/search?q={}&go=%e6%90%9c%e7%b4%a2&qs=ds&mkt=zh-CN&first={}&FORM=PERE'.format(keyword,page)
            html = requests.get(bing_url,headers=header)
            patt = re.compile('<a target="_blank" href="(.*?)" h=')
            url_list = re.findall(patt,html.text)
            ext(url_list,time_stamp)
            sleep(5)
    except requests.exceptions.ProxyError:
        print('请求被拦截，可能需要进行人机验证')

def ext(url_list,time_stamp):
    try:
        for url in url_list:
            url = parse.unquote(url)
            with open('{}.txt'.format(time_stamp),'a') as f:
                f.write(url+'\n')
                print('[+]'+url)
    except:
        print('请求超时')

#百度采集的URL需要经过二次访问才能获得原ip
def baidu_ext(url_list,time_stamp):
    try:
        for url in url_list:
            again = requests.get(url,headers=header,timeout=5.0)
            with open('{}.txt'.format(time_stamp),'a') as f:
                f.write(again.url+'\n')
                print('[+]'+again.url)
                sleep(1)
    except:
        print('请求超时')

def main(Search_engine,Search_key,Search_page):
    if Search_engine == 1:
        google(Search_key,Search_page)
    elif Search_engine == 2:
        bing(Search_key,Search_page)
    elif Search_engine == 3:
        baidu(Search_key,Search_page)
    else:
        print('请输入正确的编号')

if __name__ == '__main__':
    print('''Instructions：URL采集，请求超时可能是触发了搜索引擎的人机验证。''')
    Search_engine = input('请输入调用的搜索引擎编号[1 : Google][2 : Bing][3 : BaiDu]: ')
    Search_key = input('请输入关键词: ')
    Search_page = input('请输采集页数(10条/1页): ')
    main(int(Search_engine),Search_key,int(Search_page))

python3 编写Url采集器（Google，Baidu，Bing ）

相关阅读

相关文章

相关问答

相关文档