解析整个页面的html代码

谭泉

2023-03-14

问题内容：

from bs4 import BeautifulSoup
import urllib,sys
reload(sys)
sys.setdefaultencoding(“utf-8”)
r = urllib.urlopen('https://twitter.com/ndtv’).read()
soup = BeautifulSoup(r)

这不会使整个网页滚动到我想要的末尾，而只会滚动其中的一部分。

编辑：

from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import urllib,sys,requests
reload(sys)
sys.setdefaultencoding("utf-8")

class wait_for_more_than_n_elements_to_be_present(object):
    def __init__(self, locator, count):
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        try:
            elements = EC._find_elements(driver, self.locator)
            return len(elements) > self.count
        except StaleElementReferenceException:
            return False

def return_html_code(url):
    driver = webdriver.Firefox()
    driver.maximize_window()
    driver.get(url)
    # initial wait for the tweets to load
    wait = WebDriverWait(driver, 10)
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "li[data-item-id]")))
    # scroll down to the last tweet until there is no more tweets loaded
    while True:
        tweets = driver.find_elements_by_css_selector("li[data-item-id]")
        number_of_tweets = len(tweets)
        print number_of_tweets
        driver.execute_script("arguments[0].scrollIntoView();", tweets[-1])
        try:
            wait.until(wait_for_more_than_n_elements_to_be_present((By.CSS_SELECTOR, "li[data-item-id]"), number_of_tweets))
        except TimeoutException:
            break
    html_full_source=driver.page_source
    driver.close()
    return html_full_source


url='https://twitter.com/thecoolstacks'
#using selenium browser
html_source=return_html_code(url)
soup_selenium = BeautifulSoup(html_source)
print soup_selenium
text_tweet=[]
alltweets_selenium = soup_selenium.find_all(attrs={'data-item-type' : 'tweet'})
for tweet in alltweets_selenium:
    #Text of tweet
    html_tweet= tweet.find_all("p", class_="TweetTextSize TweetTextSize--16px js-tweet-text tweet-text")
    text_tweet.append(''.join(html_tweet[0].findAll(text=True)))    
print text_tweet

预期输出：

import requests from bs4 import BeautifulSoup      url='https://twitter.com/thecoolstacks' 
req = requests.get(url) 
soup = BeautifulSoup(req.content) 
alltweets = soup.find_all(attrs={'data-item-type' : 'tweet'}) 
print alltweets[0]

问题答案：

我仍然会坚持使用Twitter API。

另外，这是解决问题的方法selenium：

使用 Explicit Waits 并定义一个自定义的Expected Condition以等待推文加载到滚动条上
通过滚动到最后加载的推文 scrollIntoView()

实现方式：

from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class wait_for_more_than_n_elements_to_be_present(object):
    def __init__(self, locator, count):
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        try:
            elements = EC._find_elements(driver, self.locator)
            return len(elements) > self.count
        except StaleElementReferenceException:
            return False


url = "https://twitter.com/ndtv"
driver = webdriver.Firefox()
driver.maximize_window()
driver.get(url)

# initial wait for the tweets to load
wait = WebDriverWait(driver, 10)
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "li[data-item-id]")))

# scroll down to the last tweet until there is no more tweets loaded
while True:
    tweets = driver.find_elements_by_css_selector("li[data-item-id]")
    number_of_tweets = len(tweets)

    driver.execute_script("arguments[0].scrollIntoView();", tweets[-1])

    try:
        wait.until(wait_for_more_than_n_elements_to_be_present((By.CSS_SELECTOR, "li[data-item-id]"), number_of_tweets))
    except TimeoutException:
        break

这将向下滚动，以将其加载到此通道中的所有现有推文所需的数量。

这是HTML解析代码段，提取了tweet：

page_source = driver.page_source
driver.close()

soup = BeautifulSoup(page_source)
for tweet in soup.select("div.tweet div.content"):
    print tweet.p.text

它打印：

Father's Day Facebook post by arrested cop Suhas Gokhale's son got nearly 10,000 likes http://goo.gl/aPqlxf  pic.twitter.com/JUqmdWNQ3c
#HWL2015 End of third quarter! Breathtaking stuff. India 2-2 Pakistan - http://sports.ndtv.com/hockey/news/244463-hockey-world-league-semifinal-india-vs-pakistan-antwerp …
Why these Kashmiri boys may miss their IIT dream http://goo.gl/9LVKfK  pic.twitter.com/gohX21Gibi
...

解析整个页面的html代码

相关阅读

相关文章

相关问答

相关工具

相关文档