# requests爬谷歌 (Scraping Google search results with requests)
#
# 司寇祖鹤
# 2023-12-01
import requests
from bs4 import BeautifulSoup
import json
import time
import csv

# Endpoint that every search request is sent to.
url='https://www.google.com/search'

# Query-string template for the search request. 'q' (the search text) and
# 'start' (the result offset) are left empty here; the scraping code supplies
# them for each request. The remaining values are opaque tokens and viewport
# hints -- presumably captured from a real browser session; their exact
# meaning is not documented in this file.
params = {
    'safe': 'active',
    'sxsrf': 'ALeKk03BfKAgtDobSNR4uteXut6N__y38g:1611033698497',
    'ei': 'YmwGYLPyHdXh-AbPzKToCQ',
    'q': '',
    'gs_ssp': 'eJzj4tLP1TcwKzcqzjM1YPTizSotLsnMU0jKTE1KLQIAcHYIsw',
    'oq': 'justi',
    'gs_lcp': 'CgZwc3ktYWIQAxgAMg0ILhCxAxCDARBDEJMCMgoILhCxAxCDARBDMgQILhBDMgcILhCxAxBDMgcIABCxAxBDMgcILhCxAxBDMgQIABBDMggILhCxAxCDATICCAAyBQgAELEDOgUIABCRAjoICAAQsQMQgwE6BQguELEDUOqpBljaugZgjMsGaABwAngBgAGXBYgBvRKSAQkyLTEuMC4yLjKYAQCgAQGqAQdnd3Mtd2l6wAEB',
    'sclient': 'psy-ab',
    'start': '',
    'sa': 'N',
    'ved': '2ahUKEwjf-LW_t6fuAhWFd94KHXqXBo0Q8tMDegQIVxA2',
    'biw': '876',
    'bih': '900',
    'dpr': '1.5'
}

# HTTP headers sent with every request; they imitate a desktop Chrome browser
# (see the 'user-agent' value).
# NOTE(review): the 'cookie' value below carries what look like real Google
# session identifiers (SID, HSID, SSID, APISID, ...). Hard-coding them in
# source exposes the account and they will expire -- consider loading the
# cookie from the environment or a local config file instead, and confirm
# whether these values need to be revoked.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'en,zh-CN;q=0.9,zh;q=0.8',
    'cache-control': 'no-cache',
    'cookie': 'CGIC=IocBdGV4dC9odG1sLGFwcGxpY2F0aW9uL3hodG1sK3htbCxhcHBsaWNhdGlvbi94bWw7cT0wLjksaW1hZ2UvYXZpZixpbWFnZS93ZWJwLGltYWdlL2FwbmcsKi8qO3E9MC44LGFwcGxpY2F0aW9uL3NpZ25lZC1leGNoYW5nZTt2PWIzO3E9MC45; HSID=AtL1Fuks6GN88NgIF; SSID=A5G1V1zzIEPkl30hy; APISID=R62Dg908smzBc75_/AE6wtPR0lHi3PNcyr; SAPISID=DhY-UzCNe22FLTow/AjmaKK4_-XFREXtVX; __Secure-3PAPISID=DhY-UzCNe22FLTow/AjmaKK4_-XFREXtVX; SID=5AcO7y9MYqK1JBYbOB9T7xeWcJwKQjOtRbbCh60AFgdUo6QyiT-wrT0furAG-H4tCk87hA.; __Secure-3PSID=5AcO7y9MYqK1JBYbOB9T7xeWcJwKQjOtRbbCh60AFgdUo6Qyoq7YhstvGbLeLKfl7S3HHw.; OTZ=5796681_24_24__24_; SEARCH_SAMESITE=CgQI0ZEB; ANID=AHWqTUnyp6Ge1xtQ_TL0NAZmTvouupte3kUSnVW6oKAZd5CJZEL6eTEKr8Dvuy3J; 1P_JAR=2021-01-19-03; NID=207=QJ3H1_PEEqH87e2HJ9-LYqdl8T4kq3B7Ybxa6cnWTvf6FOu5kuFPIMN6sjPTa6uGQQgd_ILLwdgrFexdNcG1edmOLPwamevgB8wWBZTt8zDQ0C1qGnoO-0HT4-DD2bHyIv4mYZhQXkQwYYY1YlC1woUo4hIJbB6fI9shEBm_UIAElqTHRWjiudjgWc_VM69_cCacl4muQHElSfs-ok-7L6w0kp8-3pl6A8YVtdhoev3ms0LXWcOyTpmRP9vhrHNKBuZpSeRAuCGLI2PLSqtQspyzHMf3A18sqZIUM2hCgIhExcjR1UKwRrn7ikGllVwFqFSJDXAD9A; SIDCC=AJi4QfEXFZmOSCUgZxKqFr4vUETalX_wQAJOEnDTUtPaFDY2oUOdAha_UIyodIM1esXg72G-8w; __Secure-3PSIDCC=AJi4QfHq0q1H0qhjNe1pbyTuSEzm0nH3jNZD2QTrv-4yP_7QVwZ28wEHI0d7YL1SGNRlMlF1ow',
    'pragma': 'no-cache',
    'referer': 'https://www.google.com/',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
    'x-client-data': 'CIi2yQEIpLbJAQjBtskBCKmdygEIx8LKAQisx8oBCLTLygEIpM3KAQjAz8oBCNzVygEIlJrLAQjNmssBCNScywEIqZ3LAQiqncsBCK6dywEY+bjKARiqm8sB' 
    }

# NOTE(review): the header below is deliberately not sent -- presumably so the
# response body arrives in an encoding that requests can decode without extra
# packages; confirm before re-enabling it.
#    'accept-encoding': 'gzip, deflate, br',

# Column order of the exported CSV; also the keys of every scraped item.
FIELDNAMES = ('Title', 'Link', 'Description')


def parse_results(soup):
    """Extract the search results from one parsed Google results page.

    Args:
        soup: A BeautifulSoup document built from the response HTML.

    Returns:
        A list of dicts with the keys 'Title', 'Link' and 'Description'.
        Values are ``str`` (full Unicode text, not ASCII-stripped bytes) so
        the csv module writes the text itself rather than a ``b'...'`` literal.
    """
    items = []
    # The class filter must be a dict: a set such as {'class', 'tF2Cxc'} is
    # treated by BeautifulSoup as "class is either 'class' or 'tF2Cxc'".
    for block in soup.find_all('div', {'class': 'tF2Cxc'}):
        title = block.find('h3', {'class': 'LC20lb DKV0Md'})
        link_box = block.find('div', {'class': 'yuRUbf'})
        anchor = link_box.find('a') if link_box is not None else None
        href = anchor.get('href') if anchor is not None else None
        if title is None or not href:
            # Not a regular result (or the markup changed): skip it instead
            # of crashing with AttributeError on None.
            continue
        description = block.find('span', {'class': 'aCOpRe'})
        items.append({
            'Title': title.text,
            'Link': href,
            # Some results have no snippet; keep the row with an empty one.
            'Description': description.text if description is not None else '',
        })
    return items


def fetch_results(query, pages=5, delay=3, timeout=10):
    """Download ``pages`` result pages for ``query`` and parse them.

    Args:
        query: Search text sent as the 'q' parameter.
        pages: Number of result pages to request (10 results per page).
        delay: Seconds to wait between two consecutive requests.
        timeout: Per-request timeout in seconds, so a stalled connection
            cannot hang the script forever.

    Returns:
        A list of result dicts as produced by ``parse_results``.

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    results = []
    for page in range(pages):
        # Copy the shared template instead of mutating it; existing keys keep
        # their position, so the query string order is unchanged.
        page_params = dict(params, q=query, start=str(page * 10))
        response = requests.get(url, params=page_params, headers=headers,
                                timeout=timeout)
        print('Get Response from : %s | Status Code : %s' % (response.url, response.status_code))
        if not response.ok:
            # An error page (e.g. rate limiting) has nothing to parse, and
            # requesting further pages would only make things worse.
            break
        results.extend(parse_results(BeautifulSoup(response.text, "lxml")))
        if page < pages - 1:
            time.sleep(delay)
    return results


def export_csv(rows, path='JBscrape.csv'):
    """Write result dicts to a UTF-8 CSV file with a header row.

    Args:
        rows: Iterable of dicts keyed by ``FIELDNAMES``.
        path: Destination file; it is overwritten if it exists.

    Returns:
        The number of data rows written. An empty ``rows`` still produces a
        file containing only the header.
    """
    rows = list(rows)
    with open(path, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=FIELDNAMES)
        writer.writeheader()
        writer.writerows(rows)
    return len(rows)


def main():
    """Scrape five pages of results for 'justin bieber' into JBscrape.csv."""
    results = fetch_results('justin bieber')
    export_csv(results, 'JBscrape.csv')
    print('Exported results to "JBscrape.csv" file')


if __name__ == '__main__':
    main()

# 类似资料: