问题：

如何使用 Selenium 和 Beautiful Soup 在 Python 中更快地抓取网页？

阴福

2023-03-14

我编写了一个脚本，使用 Beautiful Soup 和 Selenium 库来抓取 Vivino 网站。

在这个网站上，我想存储某款葡萄酒的评论信息。

我必须使用 Selenium 进行动态抓取，因为评论只能通过点击网页中的“显示更多评论”按钮才能访问，而该按钮要向下滚动到页面底部后才会出现。

我把代码改成只处理一款葡萄酒，以便您在需要时可以亲自运行，看看它要花多长时间：

import requests
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd


def scroll_to_bottom_wine_page(driver, scroll_pause_time=0.01):
    """Scroll the page to the bottom until its height stops growing.

    After every scroll the document height is read again; the loop ends as
    soon as two consecutive readings are equal.

    Args:
        driver: Object exposing ``execute_script(script)`` (a Selenium
            WebDriver in this script).
        scroll_pause_time: Seconds to sleep after each scroll before the
            height is measured again. Defaults to the previously hard-coded
            0.01 s.
    """
    height_script = "return document.body.scrollHeight"

    # Height before the first scroll, used as the baseline for comparison.
    last_height = driver.execute_script(height_script)

    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait to load page
        time.sleep(scroll_pause_time)

        # An unchanged height means the scroll loaded nothing new.
        new_height = driver.execute_script(height_script)
        if new_height == last_height:
            break
        last_height = new_height

        
def scroll_to_bottom_review_page(driver, rating_count):
    """Scroll the reviews pop-up until enough review cards are loaded.

    Each round sends the END key to a link inside the ``baseModal`` element,
    re-parses the whole page with BeautifulSoup and counts the review cards.

    Args:
        driver: Selenium WebDriver; the ``baseModal`` pop-up is expected to
            be present already (the caller clicks "Show more reviews" first).
        rating_count: Target number of reviews. Scrolling stops once at least
            this many review cards have been loaded.

    Returns:
        The list of BeautifulSoup tags for the review cards found in the
        last parse.
    """
    scroll_review_pause_time = 0.8  # Change time?
    max_stalled_rounds = 2
    review_card_class = "card__card--2R5Wh reviewCard__reviewCard--pAEnA"

    time.sleep(scroll_review_pause_time)
    # find_element_by_xpath() was removed in Selenium 4.3; find_element()
    # with a By locator works on both Selenium 3 and 4.
    element_inside_popup = driver.find_element(
        By.XPATH, '//*[@id="baseModal"]/div/div[2]/div[3]//a')  # Reviews path

    stalled_rounds = 0
    previous_count = 0

    while True:
        time.sleep(scroll_review_pause_time)
        element_inside_popup.send_keys(Keys.END)
        page_html = driver.execute_script("return document.documentElement.outerHTML")
        soup = BeautifulSoup(page_html, 'lxml')
        reviews = soup.find_all("div", {"class": review_card_class})
        current_count = len(reviews)

        # In case there actually are fewer reviews than rating_count states,
        # we avoid scrolling forever: give up after several consecutive
        # rounds that loaded nothing new. Progress resets the counter so a
        # few isolated slow loads do not end the scroll prematurely.
        if current_count == previous_count:
            stalled_rounds += 1
        else:
            stalled_rounds = 0

        # ">=" rather than ">": once the target is reached there is nothing
        # left to wait for, so do not burn extra stalled rounds.
        if current_count >= rating_count or stalled_rounds > max_stalled_rounds:
            break

        previous_count = current_count

    return reviews



def _broadcast_per_wine(value, count):
    """Return *value* as a list with one entry per wine URL."""
    # Strings and objects without a length are treated as a single value
    # shared by every wine.
    if isinstance(value, (str, bytes)) or not hasattr(value, "__len__"):
        return [value] * count
    if len(value) != count:
        raise ValueError(
            "expected {} values (one per wine URL), got {}".format(count, len(value)))
    return list(value)


def _parse_review(review):
    """Extract the fields of one review card (a BeautifulSoup tag) as a dict."""
    user_link = review.find('a', href=True)['href']

    rating_label = review.find("div", class_="rating__rating--ZZb_x")["aria-label"]
    rating = float(rating_label.split()[1])

    username_tag = review.find('a', {"class": 'anchor__anchor--3DOSm reviewCard__userName--2KnRl'})
    username = str(username_tag.string)

    # Removing the user name from the ratings text leaves the remainder,
    # which is stored as the review date.
    ratings_text = review.find('div', {"class": 'reviewCard__ratingsText--1LU2T'}).text
    review_date = "".join(ratings_text.rsplit(username))

    note_tag = review.find('p', {"class": 'reviewCard__reviewNote--fbIdd'})
    review_text = str(note_tag.string).strip() if note_tag is not None else 'None'

    likes_tag = review.find("div", class_="likeButton__likeCount--82au4")
    likes_count = int(likes_tag.text) if likes_tag is not None else 0

    comments_tag = review.find("div", class_="commentsButton__commentsCount--1_Ugm")
    comments_count = int(comments_tag.text) if comments_tag is not None else 0

    return {
        'review_user_links': user_link,
        'review_ratings': rating,
        'review_usernames': username,
        'review_dates': review_date,
        'review_texts': review_text,
        'review_likes_count': likes_count,
        'review_comments_count': comments_count,
    }


def get_reviews(wine_ids, wine_urls, rating_counts):
    """Scrape the reviews of each wine page into a single DataFrame.

    For every URL the page is opened in Chrome, scrolled to the bottom, the
    "Show more reviews" link is clicked, and the review pop-up is scrolled
    and parsed.

    Args:
        wine_ids: One id per URL, or a single value used for every URL.
        wine_urls: Iterable of wine page URLs.
        rating_counts: One rating count per URL, or a single number used for
            every URL.

    Returns:
        pandas.DataFrame with one row per review and the columns
        ``wine_id``, ``review_user_links``, ``review_ratings``,
        ``review_usernames``, ``review_dates``, ``review_texts``,
        ``review_likes_count`` and ``review_comments_count``.

    Raises:
        ValueError: If ``wine_ids`` or ``rating_counts`` is a sequence whose
            length differs from the number of URLs.
    """
    columns = [
        'wine_id',
        'review_user_links',
        'review_ratings',
        'review_usernames',
        'review_dates',
        'review_texts',
        'review_likes_count',
        'review_comments_count',
    ]

    wine_urls = list(wine_urls)
    ids = _broadcast_per_wine(wine_ids, len(wine_urls))
    counts = _broadcast_per_wine(rating_counts, len(wine_urls))

    rows = []

    # Create a driver
    driver = webdriver.Chrome()
    try:
        wait = WebDriverWait(driver, 40)

        for wine_id, wine_url, rating_count in zip(ids, wine_urls, counts):
            # Pass URL to driver
            driver.get(wine_url)

            # We scroll down to the bottom of the wine webpage
            scroll_to_bottom_wine_page(driver)

            # wait.until() returns the clickable element, so no second
            # lookup is needed (find_element_by_link_text() was removed in
            # Selenium 4.3).
            more_reviews_button = wait.until(
                EC.element_to_be_clickable((By.LINK_TEXT, 'Show more reviews')))
            more_reviews_button.click()

            # Scroll till we reach the number of reviews
            reviews = scroll_to_bottom_review_page(driver, rating_count)

            for review in reviews:
                row = _parse_review(review)
                row['wine_id'] = wine_id
                rows.append(row)
    finally:
        # Always close the browser, even when a page fails to load or parse.
        driver.quit()

    # Build the frame once instead of concatenating one frame per wine.
    return pd.DataFrame(rows, columns=columns)


# Example run: scrape the reviews of a single wine and time the scrape.
wine_id = ['123']
wine_url = ['https://www.vivino.com/vinilourenco-pai-horacio-grande-reserva/w/5154081?year=2015&price_id=21118981']
wine_rating_count = 186 

# perf_counter() is monotonic, so the measured duration cannot be skewed by
# system clock adjustments the way time.time() differences can.
start_time = time.perf_counter()
reviews_info = get_reviews(wine_id, wine_url, wine_rating_count)
elapsed_time = time.perf_counter() - start_time
print('The scrape took: ', elapsed_time) #For this particular wine, the code took 38 seconds to run

我写的脚本执行以下步骤：

访问特定的葡萄酒链接（即：https://www.vivino.com/vinilourenco-pai-horacio-grande-reserva/w/5154081?year=2015）

然后，我向下滚动到网页的底部。

我找到并点击按钮"显示更多评论"

按下此按钮后，将出现一个弹出页面，其中包含葡萄酒评论

我在这些弹出窗口中向下滚动，直到达到一定数量的评论

我从评论中提取我需要的信息（每条评论都是一个 Beautiful Soup 对象）

问题是，如果我想搜集数千种葡萄酒的评论信息，那就要花很长时间。对于一款99条评论的葡萄酒来说，这需要35秒的时间。

有什么办法可以加快这个过程吗？

共有2个答案

郭瀚海

2023-03-14

这些评论来自该网站的 API：

import requests
# Send a browser-like User-Agent header with the request.
agent = {"User-Agent":'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36'}
# Without a timeout, requests.get() can block forever on an unresponsive server.
response = requests.get('https://www.vivino.com/api/wines/5154081/reviews?year=2015&per_page=100', headers=agent, timeout=30)
# Fail loudly on HTTP errors instead of trying to decode an error page as JSON.
response.raise_for_status()
reviews = response.json()["reviews"]
print(reviews)

艾自强

2023-03-14

我的建议是不要使用 Selenium。Selenium 应该是抓取网页时的最后选择。相反，要学会使用浏览器的开发者工具来了解网页是如何发出请求的。例如，对于您发布的网页，可以直接从这个 URL 获取内容：https://www.vivino.com/api/wines/5154081/reviews?year=2015

他们有一个 API！！这样的网站很容易抓取。

您只需要 requests，也许还需要 BeautifulSoup。

# Request headers sent along with the API call.
headers = {
    "pragma": "no-cache",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36",
    "x-requested-with": "XMLHttpRequest",
}

url = "https://www.vivino.com/api/wines/5154081/reviews?year=2015&per_page=10&page=1"

# Without a timeout, requests.get() can block forever on an unresponsive server.
resp = requests.get(url, headers=headers, timeout=30)
# Fail loudly on HTTP errors instead of trying to decode an error page as JSON.
resp.raise_for_status()
resp.json()

响应如下：

{'reviews': [{'id': 118841527,
   'rating': 5.0,
   'note': 'You need to taste it!! ',
   'language': 'en',
   'created_at': '2019-02-16T15:33:49.000Z',
   'aggregated': True,
   'user': {'id': 10310349,
    'seo_name': 'miguellourenco0',
    'alias': 'Miguel Lourenço',
    'is_featured': False,
    'visibility': 'all',
    'image': {'location': '//images.vivino.com/avatars/0064zilphklf01a4dd1d69f.jpg',
     'variations': {'large': '//thumbs.vivino.com/avatars/0064zilphklf01a4dd1d69f_300x300.jpg',
      'small_square': '//thumbs.vivino.com/avatars/0064zilphklf01a4dd1d69f_50x50.jpg'}},
    'statistics': {'followers_count': 14,
     'followings_count': 21,
     'ratings_count': 113,
     'ratings_sum': 0,
     'reviews_count': 90},
    'background_image': None},

类似资料：

使用 Beautiful Soup 抓取网页

我试图抓取这个网站：https://www.senate.gov/general/contact_information/senators_cfm.cfm 我的代码：问题是它实际上并没有到达网站。我在soup var中得到的HTML根本不是正确网页中的HTML。我不知道从这里去哪里！任何和所有的帮助都将不胜感激。
在 Python 中使用 Selenium 实现自动化并使用 Beautiful Soup 进行网页抓取

我正在尝试使用Selenium在网站上自动加载“Show More”，然后想使用Beautifulsoup刮取内容。我的代码正在运行，但没有给出期望的结果。我知道我做错了什么，但找不到它。对于Selenium：我的代码是单击“显示更多”按钮，但它不一致。有时它会点击5次，有时会点击10次。我希望它一直运行到最后一个“显示更多”。我不明白我做错了什么。对于Beautifulsoup：除了加载更多内
使用 Beautiful Soup 提取 href

问题内容：我正在尝试从Google搜索结果中提取链接。检查元素告诉我，我感兴趣的部分具有“ class = r”。第一个结果如下所示：要提取“ href”，我要做：但是我意外地得到：我想要的地方：属性“ ping”似乎使它感到困惑。有任何想法吗？问题答案：发生了什么？如果您打印响应内容（即），则会看到您得到的HTML完全不同。页面源和响应内容不匹配。因为内容是动态加载的，所以不
无法使用 Python 和 Beautiful Soup 从网站提取日期值

我想从一个网站上提取日期。我想知道新闻文章发表的日期/时间。这是我的代码：从bs4导入组导入请求您也可以尝试使用此网站：请查看，这是我想浏览的网站，我想获取的日期/时间是：这是我的css选择器，它只给我段落，但日期/时间不在段落中，而是在字体标记中。但当我将css选择器设置为：我得到[] 是否可以提取不在任何子标记中的数据？
使用 Python 和 Beautiful Soup 抓取多个页面

我已经成功地编写了从第一页抓取数据的代码，现在我不得不在这段代码中编写一个循环来抓取下一个“n”页。下面是代码如果有人能指导/帮助我编写代码，从剩余页面中提取数据，我将不胜感激。谢谢
从 Beautiful Soup 标签中提取 src

我试图使用 Beautiful Soup 抓取 newegg 的产品名称、描述、价格和图像。我有以下 bs4.element.Tag 类型的标签，我想从标签中提取“src”链接。以下是我的标签：我怎样才能从这个标签中提取？我试过了，但我收到了 KeyError。

如何使用 Selenium 和 Beautiful Soup 在 Python 中更快地抓取网页？

共有2个答案

相关问答

相关文章

相关阅读

相关工具

相关文档