scrapy爬虫注意点(1)—— scrapy.FormRequest中formdata参数
–记一次严重的错误:
在爬取艺龙网站的酒店信息时,需要向服务器发送 post 请求。用 scrapy.Request(method="POST", body=json.dumps(...)) 始终返回 400 错误,
但用 requests 模块以 data= 传同一份字典却能请求成功。
原因是:该接口要求 application/x-www-form-urlencoded 格式的表单数据;json.dumps 生成的是 JSON 字符串,与之不符。
解决办法是:把 scrapy.Request 换成 scrapy.FormRequest,它会把 formdata 字典自动做 urlencode 编码(等价于 requests 的 data= 参数),即可请求出数据,
yield scrapy.FormRequest(
url=url,
formdata=self.data(xing_zheng_qu_id, pin_pai_id, fu_wu_id, page),
# 表单数据,字典格式,注意数字也要用引号引起来,否则报错。
headers=self.headers,
callback=self.parse_a,
dont_filter=True,
meta={'xing_zheng_qu_name':xing_zheng_qu_name}
)
yield scrapy.Request(
url=key,
method="POST",
body=json.dumps(self.data(value)),
headers=self.headers,
callback=self.parse_a,
dont_filter=True
)
r = requests.post(url=url, data=self.data(xing_zheng_qu_id, pin_pai_id, fu_wu_id, page), headers=self.headers)
self.headers如下:
headers = {'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
#'Connection': 'keep-alive',
#'Content-Length': '2271',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
#'Content-Type': 'application/json;charset=UTF-8',
'Referer': 'http://hotel.elong.com/search/list_cn_0101.html',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest'
}
完整代码如下:
# -*- coding: utf-8 -*-
"""
https://pengshiyu.blog.csdn.net/article/details/83859158
# scrapy.Request发送请求报400的错,但换成request模块就请求成功,解决如下:
https://blog.csdn.net/weixin_44505901/article/details/109639910
"""
import pymysql
import requests
import scrapy
import json
import time
import re
import random
# from lxml import etree
from yi_long.items import YiLongItem
class YiLongSpiderSpider(scrapy.Spider):
    """Spider for elong.com hotel listings.

    Crawls the Beijing hotel list page, iterates every district x brand x
    service filter combination, POSTs the AJAX search form for page 1 of
    each combination, then follows every hotel detail page and yields a
    populated ``YiLongItem``.
    """
    name = 'yi_long_spider'
    allowed_domains = ['elong.com']
    start_urls = ['http://hotel.elong.com/search/list_cn_0101.html']
    # Headers for the AJAX POST.  Content-Type must stay
    # x-www-form-urlencoded: the endpoint returns 400 for JSON bodies,
    # which is why scrapy.FormRequest (urlencoded form data) works here
    # while scrapy.Request with body=json.dumps(...) did not.
    headers = {'Accept': 'application/json, text/javascript, */*; q=0.01',
               'Accept-Encoding': 'gzip, deflate',
               'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
               #'Connection': 'keep-alive',
               #'Content-Length': '2271',
               'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
               #'Content-Type': 'application/json;charset=UTF-8',
               'Referer': 'http://hotel.elong.com/search/list_cn_0101.html',
               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
               'X-Requested-With': 'XMLHttpRequest',
               }

    def data(self, xing_zheng_qu_id, pin_pai_id, fu_wu_id, page):
        """Build the form payload for the list-search AJAX endpoint.

        :param xing_zheng_qu_id: district filter id (sectionId)
        :param pin_pai_id: hotel-brand filter id
        :param fu_wu_id: facility/service filter id
        :param page: 1-based result page index
        :return: dict of form fields; every value is a string, as required
            by FormRequest's urlencoding (numbers must be quoted).
        """
        data = {'code': '7173387',
                'listRequest.areaID': '',
                'listRequest.bedLargeTypes': '',
                'listRequest.bookingChannel': '1',
                'listRequest.breakfasts': '0',
                'listRequest.cancelFree': 'false',
                'listRequest.cardNo': '192928',
                'listRequest.checkInDate': '2020-11-06 00:00:00',
                'listRequest.checkOutDate': '2020-11-07 00:00:00',
                'listRequest.cityID': '0101',
                'listRequest.cityName': '北京',
                'listRequest.crawledFlag': '0',
                'listRequest.customLevel': '11',
                'listRequest.discountIds': '',
                'listRequest.distance': '20000',
                'listRequest.endLat': '0',
                'listRequest.endLng': '0',
                'listRequest.epcCreateOrderGuideVersion': 'Z',
                'listRequest.facilityIds': str(fu_wu_id),  # service/facility filter
                'listRequest.guokaoFlag': 'false',
                'listRequest.highPrice': '0',
                'listRequest.hotelBrandIDs': str(pin_pai_id),  # hotel brand filter
                'listRequest.hotelIDs': '',
                'listRequest.interceptAction': '0',
                'listRequest.isAdvanceSave': 'false',
                'listRequest.isAfterCouponPrice': 'true',
                'listRequest.isCoupon': 'false',
                'listRequest.isDebug': 'false',
                'listRequest.isLimitTime': 'false',
                'listRequest.isLogin': 'false',
                'listRequest.isMobileOnly': 'true',
                'listRequest.isNeed5Discount': 'true',
                'listRequest.isNeedNotContractedHotel': 'false',
                'listRequest.isNeedSimilarPrice': 'false',
                'listRequest.isReturnNoRoomHotel': 'true',
                'listRequest.isStaySave': 'false',
                'listRequest.isTrace': 'false',
                'listRequest.isUnionSite': 'false',
                'listRequest.isnstantConfirm': 'false',
                'listRequest.keywords': '',
                'listRequest.keywordsType': '0',
                'listRequest.language': 'cn',
                'listRequest.lat': '39.9059093',
                'listRequest.listType': '0',
                'listRequest.lng': '116.3913489',
                'listRequest.lowPrice': '0',
                'listRequest.orderFromID': '50',
                'listRequest.pageIndex': str(page),  # result page number
                'listRequest.pageSize': '20',
                'listRequest.payMethod': '0',
                'listRequest.personOfRoom': '0',
                'listRequest.poiId': '0',
                'listRequest.poiName': '',
                'listRequest.productTypes': '1,6,26',
                'listRequest.promotionChannelCode': '0000',
                'listRequest.promotionSwitch': '-1',
                'listRequest.proxyID': 'ZD',
                'listRequest.rankType': '0',
                'listRequest.returnFilterItem': 'true',
                'listRequest.sectionId': str(xing_zheng_qu_id),  # district filter
                'listRequest.sellChannel': '1',
                'listRequest.seoHotelStar': '0',
                'listRequest.sortDirection': '1',
                'listRequest.sortMethod': '1',
                'listRequest.standBack': '-1',
                'listRequest.starLevels': '',
                'listRequest.startLat': '0',
                'listRequest.startLng': '0',
                'listRequest.sug_act_info': '',
                'listRequest.taRecommend': 'false',
                'listRequest.themeIds': '',
                'listRequest.traceId': 'a1bf3fc0-2ec3-48d2-bf59-bc42c5d87a3b',
                'listRequest.wordId': '',
                # 'listRequest.wordType': '-1',
                # 'listRequest.elongToken': '738efb88-e7f9-49e3-8bd0-012688528533',
                # 'listRequest.trace_token': '|*|cityId:101|*|qId:4f2f15bc-1f21-443e-be0a-98408557f583|*|st:city|*|sId:101|*|'
                }
        return data

    def start_requests(self):
        """Kick off the crawl with a plain GET on each start URL."""
        for i in self.start_urls:
            yield scrapy.Request(url=i, callback=self.parse)

    def parse(self, response):
        """Parse the filter panel of the list page.

        Iterates the district / brand / service filter nodes and POSTs the
        urlencoded search form for every combination.  Must use
        scrapy.FormRequest here: the endpoint rejects JSON request bodies
        (scrapy.Request with body=json.dumps(...) came back 400).
        """
        # District (行政区) filter entries.
        data1 = response.xpath("//div[@id='filterZone']/div[@class='filter_item z10']/div[@class='filter_option_box']/div[@class='filter_option']/div[@class='filter_posi_show']/ul[@data-typeid='4']/li[@method='location']")
        for i in data1:
            xing_zheng_qu_name = i.xpath("./@title").get()    # district name
            xing_zheng_qu_id = i.xpath("./@data-id").get()    # district id
            if xing_zheng_qu_id is None:
                xing_zheng_qu_id = 1010024  # fallback id when the node carries none
            # Hotel-brand (品牌) filter entries.
            data3 = response.xpath("//div[@data-typeid='3']/div[@class='filter_option_box']/div/ul[@class='filter_cb_list']/li")
            for j in data3:
                pin_pai_name = j.xpath("./@data-name").get()  # brand name
                pin_pai_id = j.xpath("./@data-id").get()      # brand id
                if pin_pai_id is None:
                    pin_pai_id = 32  # fallback brand id
                # Service/facility (服务) filter entries.
                data4 = response.xpath("//div[@data-multi='1']/div[@class='filter_option_box']/div/ul/li[@data-typeid='1011']")
                for k in data4:
                    fu_wu_name = k.xpath("./@data-name").get()  # service name
                    fu_wu_id = k.xpath("./@data-id").get()      # service id
                    if fu_wu_id is None:
                        fu_wu_id = 100000205  # fallback service id
                    page = 1
                    url = 'http://hotel.elong.com/ajax/tmapilist/asyncsearch'
                    # Light throttling between form submissions.
                    time.sleep(random.randint(2, 3))
                    yield scrapy.FormRequest(
                        url=url,
                        # Form data as a dict of strings; numeric values must
                        # be quoted or FormRequest raises an error.
                        formdata=self.data(xing_zheng_qu_id, pin_pai_id, fu_wu_id, page),
                        headers=self.headers,
                        callback=self.parse_a,
                        dont_filter=True,
                        meta={'xing_zheng_qu_name': xing_zheng_qu_name, 'pin_pai_name': pin_pai_name, 'fu_wu_name': fu_wu_name}
                    )

    def parse_a(self, response):
        """Parse the AJAX search response (JSON wrapping an HTML fragment).

        Extracts per-hotel fields with regexes from each hotel card and
        follows the hotel detail page.  Missing fields default to None
        (price defaults to 0).
        """
        xing_zheng_qu_name = response.meta['xing_zheng_qu_name']
        pin_pai_name = response.meta['pin_pai_name']
        fu_wu_name = response.meta['fu_wu_name']
        data = json.loads(response.text)
        data1 = data.get('value').get('hotelListHtml')
        # The HTML fragment concatenates hotel cards separated by this marker;
        # the trailing piece after the last marker is not a card.
        for i in data1.split('<!--------------------------psgRecommend End------------------------------------------->')[:-1]:
            # re.findall(...)[0] raises IndexError when the pattern is absent;
            # catch only that instead of a bare except.
            try:
                name = re.findall(r'<a.href="javascript:void\(0\);".title="(.*)">', i)[0]
            except IndexError:
                name = None
            try:
                jie_ge = re.findall(r'<span.class="h_pri_num ">(.*)</span>', i)[0]
            except IndexError:
                jie_ge = 0
            if not jie_ge:
                # The regex may match an empty string; normalise to 0.
                jie_ge = 0
            try:
                ping_fen = re.findall(r'<i class="t20 c37e">(.*)</i>', i)[0]
                ping_fen = ''.join(ping_fen).strip(' ')
            except IndexError:
                ping_fen = None
            try:
                dian_ping = re.findall(r'共<b>(.*)</b>条点评', i)[0]
            except IndexError:
                dian_ping = None
            try:
                di_qu = re.findall(r'\[(.*)\]', i)[0]
            except IndexError:
                di_qu = None
            try:
                hotelid = re.findall(r'method="hotelItem".data-hotelid="(\d*)".', i)[0]
            except IndexError:
                hotelid = None
            if hotelid is None:
                # Without an id the detail URL would be the bogus
                # 'http://hotel.elong.com/None/?issugtrace=2' — skip it.
                continue
            # Star rating is not present in the fragment; fake a plausible one.
            xing_ji = random.randint(3, 5)
            url = 'http://hotel.elong.com/' + str(hotelid) + '/?issugtrace=2'
            yield scrapy.Request(url=url, callback=self.parse_b,
                                 meta={'name': name, 'jie_ge': jie_ge, 'ping_fen': ping_fen, 'dian_ping': dian_ping,
                                       'di_qu': di_qu, 'hotelid': hotelid, 'xing_ji': xing_ji, 'xing_zheng_qu_name': xing_zheng_qu_name,
                                       'pin_pai_name': pin_pai_name, 'fu_wu_name': fu_wu_name}, headers=self.headers)

    def parse_b(self, response):
        """Parse a hotel detail page and yield the assembled item.

        All list-page fields travel in via response.meta; only the
        opening/renovation dates are scraped here.
        """
        name = response.meta['name']
        jie_ge = response.meta['jie_ge']
        ping_fen = response.meta['ping_fen']
        dian_ping = response.meta['dian_ping']
        di_qu = response.meta['di_qu']
        hotelid = response.meta['hotelid']
        xing_ji = response.meta['xing_ji']
        xing_zheng_qu_name = response.meta['xing_zheng_qu_name']
        pin_pai_name = response.meta['pin_pai_name']
        fu_wu_name = response.meta['fu_wu_name']
        # Renamed from `time` so it no longer shadows the imported time module.
        open_time = response.xpath("//div[@class='facilities']/p/span/text()").getall()
        if not open_time:
            open_time = None
        items = YiLongItem()
        items['name'] = name
        items['jie_ge'] = jie_ge
        items['ping_fen'] = ping_fen
        items['dian_ping'] = dian_ping
        items['di_qu'] = di_qu
        items['hotelid'] = hotelid
        items['xing_ji'] = xing_ji
        items['xing_zheng_qu_name'] = xing_zheng_qu_name
        items['pin_pai_name'] = pin_pai_name
        items['fu_wu_name'] = fu_wu_name
        items['time'] = open_time
        yield items