我编写了一个脚本,使用 Beautiful Soup 和 Selenium 库来浏览 Vivino 网站。
在这个网站上,我想存储某款葡萄酒的评论信息。
我必须使用 Selenium 进行动态抓取,因为评论只能通过点击网页中的“显示更多评论”按钮访问,该按钮在向下滚动到页面底部后才会出现。
我只为一种葡萄酒运行了这段代码,以便您可以看到它需要多长时间:
import requests
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
def scroll_to_bottom_wine_page(driver, scroll_pause_time=0.01):
    """Scroll a Selenium driver to the bottom of the current page.

    Repeatedly jumps to the current document height until the height stops
    growing, i.e. no more lazy-loaded content appears.

    Args:
        driver: a Selenium WebDriver whose page is already loaded.
        scroll_pause_time: seconds to wait after each scroll so new content
            can load (default 0.01; raise it for slow pages).
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Jump to the bottom of the page as it currently stands.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Give dynamically loaded content a moment to render.
        time.sleep(scroll_pause_time)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # Height stopped changing: we reached the true bottom.
            break
        last_height = new_height
def scroll_to_bottom_review_page(driver, rating_count):
    """Scroll the Vivino review pop-up until all reviews are loaded.

    Sends END keystrokes into the review modal, re-parsing the page after
    each scroll, until either more reviews than ``rating_count`` have been
    rendered or the review count stops growing for several consecutive
    iterations (the site has no more reviews to show).

    Args:
        driver: Selenium WebDriver with the review modal already open.
        rating_count: expected number of reviews, used as a stop indicator.

    Returns:
        A list of BeautifulSoup tags, one per rendered review card.
    """
    stuck_counter = 0
    previous_count = 0
    scroll_review_pause_time = 0.8  # seconds between scrolls; tune for slow pages
    time.sleep(scroll_review_pause_time)
    # Anchor element inside the modal; END keys sent to it scroll the pop-up.
    # (find_element(By.XPATH, ...) replaces the removed find_element_by_xpath.)
    element_inside_popup = driver.find_element(By.XPATH, '//*[@id="baseModal"]/div/div[2]/div[3]//a')  # Reviews path
    while True:
        time.sleep(scroll_review_pause_time)
        element_inside_popup.send_keys(Keys.END)
        page_html = driver.execute_script("return document.documentElement.outerHTML")
        soup = BeautifulSoup(page_html, 'lxml')
        reviews = soup.find_all("div", {"class": "card__card--2R5Wh reviewCard__reviewCard--pAEnA"})
        current_count = len(reviews)
        if current_count == previous_count:
            # No new reviews appeared: there may actually be fewer reviews
            # than rating_count states, so avoid scrolling down forever.
            stuck_counter += 1
        else:
            # Progress resumed — reset so transient stalls don't abort early
            # (the old code never reset, so three stalls anywhere stopped it).
            stuck_counter = 0
        if current_count > rating_count or stuck_counter > 2:
            break
        previous_count = current_count
    return reviews
def get_reviews(wine_ids, wine_urls, rating_counts):
    """Scrape the review cards for each wine URL into one DataFrame.

    For every URL: open the wine page, scroll to the bottom so the
    'Show more reviews' button renders, click it, scroll the review
    pop-up, then parse each review card with BeautifulSoup.

    Args:
        wine_ids: sequence of wine identifiers, parallel to ``wine_urls``.
        wine_urls: sequence of Vivino wine-page URLs.
        rating_counts: expected review count, forwarded to the review
            scroller as its stop indicator.

    Returns:
        pandas.DataFrame with one row per review and columns: wine_id,
        review_user_links, review_ratings, review_usernames, review_dates,
        review_texts, review_likes_count, review_comments_count.
    """
    review_info = pd.DataFrame()
    driver = webdriver.Chrome()
    try:
        # Pair each id with its URL; the old code stored the entire id list
        # in every row of the wine_id column instead of the individual id.
        for wine_id, wine_url in zip(wine_ids, wine_urls):
            driver.get(wine_url)
            # Scroll the wine page so the reviews button is rendered.
            scroll_to_bottom_wine_page(driver)
            # Wait for the "Show more reviews" button, then click it.
            wait = WebDriverWait(driver, 40)
            wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'Show more reviews')))
            driver.find_element(By.LINK_TEXT, 'Show more reviews').click()
            # Scroll the pop-up until we reach the number of reviews.
            reviews = scroll_to_bottom_review_page(driver, rating_counts)

            review_user_links = []
            review_ratings = []
            review_usernames = []
            review_dates = []
            review_texts = []
            review_likes_count = []
            review_comments_count = []
            for review in reviews:
                review_user_links.append([a['href'] for a in review.find_all('a', href=True)][0])
                # aria-label reads like "Rating 4.5 out of 5" — take the number.
                review_ratings.append(float(review.find("div", class_="rating__rating--ZZb_x")["aria-label"].split()[1]))
                review_usernames.append(str(review.find('a', {"class": 'anchor__anchor--3DOSm reviewCard__userName--2KnRl'}).string))
                # The ratings text embeds the username; remove it, keeping the date.
                review_dates.append("".join(review.find('div', {"class": 'reviewCard__ratingsText--1LU2T'}).text.rsplit(str(review_usernames[-1]))))
                note = review.find('p', {"class": 'reviewCard__reviewNote--fbIdd'})
                # Strip once on append; the old code re-stripped the whole list
                # every iteration (quadratic) and only on one branch.
                review_texts.append(str(note.string).strip() if note is not None else 'None')
                likes = review.find("div", class_="likeButton__likeCount--82au4")
                review_likes_count.append(int(likes.text) if likes is not None else 0)
                comments = review.find("div", class_="commentsButton__commentsCount--1_Ugm")
                review_comments_count.append(int(comments.text) if comments is not None else 0)

            # Build the per-wine frame in one shot instead of column-by-column.
            review_info_temp = pd.DataFrame({
                'wine_id': [wine_id] * len(reviews),
                'review_user_links': review_user_links,
                'review_ratings': review_ratings,
                'review_usernames': review_usernames,
                'review_dates': review_dates,
                'review_texts': review_texts,
                'review_likes_count': review_likes_count,
                'review_comments_count': review_comments_count,
            })
            # Update the total dataframe.
            review_info = pd.concat([review_info, review_info_temp], axis=0, ignore_index=True)
    finally:
        # Always release the browser, even if a scrape step raises.
        driver.quit()
    return review_info
if __name__ == "__main__":
    # Example run for a single wine, so the scrape time can be measured.
    # Guarded so importing this module no longer launches a browser.
    wine_id = ['123']
    wine_url = ['https://www.vivino.com/vinilourenco-pai-horacio-grande-reserva/w/5154081?year=2015&price_id=21118981']
    wine_rating_count = 186
    start_time = time.time()
    reviews_info = get_reviews(wine_id, wine_url, wine_rating_count)
    elapsed_time = time.time() - start_time
    print('The scrape took: ', elapsed_time)  # For this particular wine, the code took 38 seconds to run
我写的脚本执行以下步骤:
>
使用特定的葡萄酒链接(即:https://www.vivino.com/vinilourenco-pai-horacio-grande-reserva/w/5154081?year=2015
然后,我向下滚动到网页的底部。
我找到并点击按钮"显示更多评论"
按下此按钮后,将出现一个弹出页面,其中包含葡萄酒评论
我在这些弹出窗口中向下滚动,直到达到一定数量的评论
我从评论中提取我需要的信息(每个评论都是一个 Beautiful Soup 对象)
问题是,如果我想搜集数千种葡萄酒的评论信息,那就要花很长时间。对于一款99条评论的葡萄酒来说,这需要35秒的时间。
有什么办法可以加快这个过程吗?
这些评论可以直接从其 API 获取:
import requests

# Identify ourselves as a regular browser so the API answers normally.
headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36'}
url = 'https://www.vivino.com/api/wines/5154081/reviews?year=2015&per_page=100'
response = requests.get(url, headers=headers)
reviews = response.json()["reviews"]
print(reviews)
我的建议是不要使用 Selenium。Selenium 应该是抓取网页的最后选择。相反,要学会理解网页是如何使用浏览器的开发者工具发出请求的。例如,对于您发布的网页,这是您可以检索内容的 URL:https://www.vivino.com/api/wines/5154081/reviews?year=2015
他们有一个 API!这样的数据很容易抓取。
您只需要 requests 库,也许还需要 BeautifulSoup。
# Mimic the browser's own XHR request so the endpoint responds normally.
headers = {
    "pragma": "no-cache",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36",
    "x-requested-with": "XMLHttpRequest",
}
url = "https://www.vivino.com/api/wines/5154081/reviews?year=2015&per_page=10&page=1"
response = requests.get(url, headers=headers)
response.json()
答案如下:
{'reviews': [{'id': 118841527,
'rating': 5.0,
'note': 'You need to taste it!! ',
'language': 'en',
'created_at': '2019-02-16T15:33:49.000Z',
'aggregated': True,
'user': {'id': 10310349,
'seo_name': 'miguellourenco0',
'alias': 'Miguel Lourenço',
'is_featured': False,
'visibility': 'all',
'image': {'location': '//images.vivino.com/avatars/0064zilphklf01a4dd1d69f.jpg',
'variations': {'large': '//thumbs.vivino.com/avatars/0064zilphklf01a4dd1d69f_300x300.jpg',
'small_square': '//thumbs.vivino.com/avatars/0064zilphklf01a4dd1d69f_50x50.jpg'}},
'statistics': {'followers_count': 14,
'followings_count': 21,
'ratings_count': 113,
'ratings_sum': 0,
'reviews_count': 90},
'background_image': None},
我试图抓取这个网站:https://www.senate.gov/general/contact_information/senators_cfm.cfm 我的代码: 问题是它实际上并没有到达网站。我在soup var中得到的HTML根本不是正确网页中的HTML。 我不知道从这里去哪里!任何和所有的帮助都将不胜感激。
我正在尝试使用Selenium在网站上自动加载“Show More”,然后想使用Beautifulsoup刮取内容。 我的代码正在运行,但没有给出期望的结果。我知道我做错了什么,但找不到它。对于Selenium:我的代码是单击“显示更多”按钮,但它不一致。有时它会点击5次,有时会点击10次。我希望它一直运行到最后一个“显示更多”。我不明白我做错了什么。对于Beautifulsoup:除了加载更多内
问题内容: 我正在尝试从Google搜索结果中提取链接。检查元素告诉我,我感兴趣的部分具有“ class = r”。第一个结果如下所示: 要提取“ href”,我要做: 但是我意外地得到: 我想要的地方: 属性“ ping”似乎使它感到困惑。有任何想法吗? 问题答案: 发生了什么? 如果您打印响应内容(即),则会看到您得到的HTML完全不同。页面源和响应内容不匹配。 因为内容是动态加载的,所以 不
我想从一个网站上提取日期。我想知道新闻文章发表的日期/时间。这是我的代码: 从bs4导入组导入请求 您也可以尝试使用此网站: 请查看,这是我想浏览的网站,我想获取的日期/时间是: 这是我的css选择器,它只给我段落,但日期/时间不在段落中,而是在字体标记中。 但当我将css选择器设置为: 我得到[] 是否可以提取不在任何子标记中的数据?
我已经成功地编写了从第一页抓取数据的代码,现在我不得不在这段代码中编写一个循环来抓取下一个“n”页。下面是代码 如果有人能指导/帮助我编写代码,从剩余页面中提取数据,我将不胜感激。 谢谢
我试图使用美丽的汤抓取newegg的产品名称、描述、价格和图像。我有以下bs4.element.标签类型,我想从标签中提取“src”链接。以下是我的标签: 我怎样才能提取 从这个标签?我试过了 但我收到了Keyerror。