13. 网络爬虫案例实战1
优质
小牛编辑
134浏览
2023-12-01
- 本次案例是通过登录人人网,抓取登录后用户中心的信息
1. 模拟人人登录请求,执行登录验证操作
from urllib import request,parse
# Script 1: POST the login form to renren.com and print the raw reply.
login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2018321648829'

# Form fields sent with the login request.
# NOTE(review): 'f' is already percent-encoded, and parse.urlencode() will
# encode its '%' characters a second time ('%3A' -> '%253A'). Confirm that is
# what the server expects.
data = {
    'email': '1352*****6',
    'icode': '',
    'origURL': 'http://www.renren.com/home',
    'domain': 'renren.com',
    'key_id': '1',
    'captcha_type': 'web_login',
    'password': '478b7c2dca554eeabed3b7374703bff4a6a22e78b8a9fcfb090e3a7fb792992b',
    'rkey': 'e954ec64a7ecf4e33bdf81bb1abad158',
    'f': 'http%3A%2F%2Fwww.renren.com%2F965541786',
}
data = parse.urlencode(data)

# Encode once and measure the bytes: Content-Length counts bytes on the wire,
# not characters of the str.
payload = data.encode('utf-8')
headers = {
    'Content-Length': len(payload),
}

req = request.Request(url=login_url, data=payload, headers=headers)

# The context manager closes the HTTP response (and its socket) even if
# reading or decoding raises.
with request.urlopen(req) as response:
    print(response.read().decode('utf-8'))
2. 抓取登录成功后的用户home页信息
from urllib import request
import re,gzip
# Script 2: fetch the user's home page with a hard-coded Cookie header and
# print the contents of every <title> element found.
base_url = 'http://www.renren.com/965541786'

# Request headers, including the session cookie that authenticates the request.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': 'anonymid=jgdcqcjgqy4yxt; depovince=BJ; _r01_=1; JSESSIONID=abc7HUv9M_HsB7WkgK2lw; ick_login=b954cf62-bbe5-480d-b679-e1e3ce584896; SL_GWPT_Show_Hide_tmp=1; SL_wptGlobTipTmp=1; ick=b4337770-b7ce-4a70-b9d0-cd63c7fc7bb5; XNESSESSIONID=738f4bde312f; jebe_key=f950add1-40e8-4009-a157-bfc3d89f7350%7C24a48cb369f8637c5ee2c4a23eb5b93f%7C1524555370510%7C1%7C1524555375485; first_login_flag=1; ln_uact=13520319616; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; wp_fold=0; jebecookies=ef7f7372-0e70-45db-aaae-c415d4611918|||||; _de=8C2F648D7158ED727318288C8F3F21C5; p=f1ea4b6984cefb7d88164a67816c91fe6; t=401516286d37bde6735180d25f68f2fe6; societyguester=401516286d37bde6735180d25f68f2fe6; id=965541786; xnsid=928c27b; ver=7.0; loginfrom=null',
    'Host': 'www.renren.com',
    'Referer': 'http://www.renren.com/SysHome.do',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
}

req = request.Request(url=base_url, headers=headers)

# Read the body and the encoding header while the response is open; the
# context manager then closes the connection.
with request.urlopen(req) as res:
    raw_body = res.read()
    content_encoding = res.headers.get('Content-Encoding', '')

# Because the request advertises gzip support, the server may compress the
# body. Decoding compressed bytes directly fails with
#   UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1
# so decompress first, but only when the server actually used gzip.
# (Removing 'gzip' from Accept-Encoding is an alternative way to avoid this.)
if 'gzip' in content_encoding.lower():
    raw_body = gzip.decompress(raw_body)

html = raw_body.decode("utf-8")
print(re.findall("<title>(.*?)</title>", html))
3. 使用cookiejar将上面两个合并到一起执行
from urllib import request,parse
import re,gzip,time
# cookie管理模块,
from http import cookiejar
# In-memory store for the cookies received from the server.
cookie = cookiejar.CookieJar()
# Handler that saves response cookies into `cookie` and attaches them to
# subsequent requests.
cookie_handler = request.HTTPCookieProcessor(cookie)
# Opener that sends every request through the cookie handler, so the login
# cookies are reused by later requests made with it.
opener = request.build_opener(cookie_handler)
def doLogin():
    """POST the login form to renren.com through the shared ``opener``.

    The reply body is not used; the request is made so that the cookies the
    server sets end up in the cookie jar attached to ``opener``.

    Raises:
        urllib.error.URLError: if the request cannot be completed.
    """
    login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2018321648829'
    # NOTE(review): 'f' is already percent-encoded and parse.urlencode() will
    # encode its '%' characters again; confirm the server expects that.
    data = {
        'email': '1352*****16',
        'icode': '',
        'origURL': 'http://www.renren.com/home',
        'domain': 'renren.com',
        'key_id': '1',
        'captcha_type': 'web_login',
        'password': '478b7c2dca554eeabed3b7374703bff4a6a22e78b8a9fcfb090e3a7fb792992b',
        'rkey': 'e954ec64a7ecf4e33bdf81bb1abad158',
        'f': 'http%3A%2F%2Fwww.renren.com%2F965541786',
    }
    # Encode once; Content-Length must count bytes, not str characters.
    payload = parse.urlencode(data).encode('utf-8')
    headers = {
        'Content-Length': len(payload),
    }
    req = request.Request(url=login_url, data=payload, headers=headers)
    # Only the cookies matter here, so close the response right away instead
    # of leaving the connection open.
    opener.open(req).close()
def myHome():
    """Fetch the user's home page through ``opener`` and print its titles.

    Prints the list of strings found between ``<title>`` and ``</title>`` in
    the UTF-8 decoded page.

    Raises:
        urllib.error.URLError: if the request cannot be completed.
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    home_url = 'http://www.renren.com/965541786'
    # The context manager closes the response once the body has been read.
    with opener.open(home_url) as res:
        html = res.read().decode("utf-8")
    print(re.findall("<title>(.*?)</title>", html))
if __name__ == '__main__':
    # Step 1: log in so the session cookies are stored.
    print("正在登录中...")
    doLogin()

    # Step 2: pause briefly, then request the personal home page.
    time.sleep(3)
    myHome()
4. 使用requests重写第三步的代码,实现人人网登录并抓取登录后信息
import requests
import re,time
# Shared session: cookies set by the login response are kept on it and sent
# with later requests made through the same object.
s = requests.Session()
def doLogin():
    """POST the login form to renren.com through the shared session ``s``.

    The reply is discarded; the request is made so that the cookies the server
    sets are stored on the session.

    Raises:
        requests.exceptions.RequestException: if the request fails or does
            not complete within the timeout.
    """
    login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2018321648829'
    # NOTE(review): 'f' is already percent-encoded and requests will
    # form-encode it again; confirm the server expects that.
    data = {
        'email': '1352*****6',
        'icode': '',
        'origURL': 'http://www.renren.com/home',
        'domain': 'renren.com',
        'key_id': '1',
        'captcha_type': 'web_login',
        'password': '478b7c2dca554eeabed3b7374703bff4a6a22e78b8a9fcfb090e3a7fb792992b',
        'rkey': 'e954ec64a7ecf4e33bdf81bb1abad158',
        'f': 'http%3A%2F%2Fwww.renren.com%2F965541786',
    }
    # requests has no default timeout; without one an unresponsive server
    # would block this call indefinitely.
    s.post(login_url, data=data, timeout=10)
def myHome():
    """Fetch the user's home page through session ``s`` and print its titles.

    Prints the list of strings found between ``<title>`` and ``</title>`` in
    the UTF-8 decoded page.

    Raises:
        requests.exceptions.RequestException: if the request fails or does
            not complete within the timeout.
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    home_url = 'http://www.renren.com/965541786'
    # requests has no default timeout; without one an unresponsive server
    # would block this call indefinitely.
    res = s.get(home_url, timeout=10)
    html = res.content.decode("utf-8")
    print(re.findall("<title>(.*?)</title>", html))
if __name__ == '__main__':
    # Step 1: log in so the session holds the login cookies.
    print("正在登录中...")
    doLogin()

    # Step 2: pause briefly, then request the personal home page.
    time.sleep(3)
    myHome()