from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import urllib,requests,unidecode,lxml,pdb
from pyvirtualdisplay import Display
from xvfbwrapper import Xvfb
class wait_for_more_than_n_elements_to_be_present(object):
    """Wait condition that holds once a locator matches more than ``count`` elements.

    Instances are callables taking a driver, so they can be handed to
    ``WebDriverWait.until`` like the built-in expected conditions.
    """

    def __init__(self, locator, count):
        """Remember what to look for and the element count that must be exceeded.

        Args:
            locator: ``(by, value)`` tuple, e.g. ``(By.CSS_SELECTOR, "li")``.
            count: number of matching elements that must be exceeded.
        """
        # The constructor has to be named __init__; a plain ``init`` method is
        # never invoked on instantiation, so the arguments would be rejected.
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        """Return True when ``driver`` finds more than ``self.count`` matches.

        A stale-element error during the lookup is treated as "not yet" so the
        surrounding wait simply polls again.
        """
        try:
            # Public lookup API instead of the private EC._find_elements helper.
            elements = driver.find_elements(*self.locator)
        except StaleElementReferenceException:
            return False
        return len(elements) > self.count
def return_html_code(url):
    """Open *url* in Firefox on a virtual display, scroll until no more tweets load, and return the HTML.

    The page is scrolled to the last ``li[data-item-id]`` element repeatedly;
    scrolling stops when no additional elements appear before the wait times
    out.

    Args:
        url: address of the page to load.

    Returns:
        The page source as reported by the driver after scrolling finished.

    Raises:
        TimeoutException: if no tweet element becomes visible during the
            initial wait.
    """
    tweet_locator = (By.CSS_SELECTOR, "li[data-item-id]")

    print(url)  # single-argument call form prints the same on Python 2 and 3
    vdisplay = Xvfb()
    vdisplay.start()
    try:
        driver = webdriver.Firefox()
        try:
            driver.maximize_window()
            driver.get(url)

            # initial wait for the tweets to load
            wait = WebDriverWait(driver, 240)
            wait.until(EC.visibility_of_element_located(tweet_locator))

            # scroll down to the last tweet until no more tweets are loaded
            while True:
                tweets = driver.find_elements(*tweet_locator)
                if not tweets:
                    # Nothing to scroll to; avoid IndexError on tweets[-1].
                    break
                # The count seen before scrolling is the threshold the wait
                # below must exceed.
                number_of_tweets = len(tweets)
                print(number_of_tweets)
                driver.execute_script("arguments[0].scrollIntoView(true);", tweets[-1])
                try:
                    wait.until(
                        wait_for_more_than_n_elements_to_be_present(
                            tweet_locator, number_of_tweets
                        )
                    )
                except TimeoutException:
                    # No new tweets appeared within the timeout: treat as the end.
                    break

            return driver.page_source
        finally:
            # Always end the browser session, even when a wait or lookup fails.
            driver.quit()
    finally:
        vdisplay.stop()
# Call return_html_code for `url` and keep its result in html_full.
# NOTE(review): `url` is not defined in the visible code — presumably assigned elsewhere; confirm.
html_full=return_html_code(url)
输出:
https://twitter.com/search?q=Error%20Check&src=typd&lang=en
20
39
56
74
我上面的代码本应不断向下滚动页面,直到没有更多推文加载为止。但它似乎在某个位置提前停止了。
编辑1:
$ phantomjs --version
2.1.1
运行 @alexce 的代码时,两次运行得到了不同的输出;从最早一条推文的日期可以清楚地看出,还有更多推文没有被加载:
https://twitter.com/search?q=Error%20Check&src=typd&lang=en
20
40
59
76
95
114
133
152
171
191
211
231
249
267
Date of most old tweet: 12 Jan 2016
https://twitter.com/search?q=Error%20Check&src=typd&lang=en
20
40
59
76
95
114
133
152
171
191
211
231
249
267
287
303
317
337
356
373
388
400
418
437
457
476
492
Date of most old tweet: 8 Jan 2016
编辑2:
运行 @alexce 代码的更新版本时,在加载约 7000 条推文后出现以下错误:
Traceback (most recent call last):
File "twitter_script.py", line 82, in <module>
search_twitter('Alcoholics Anonymous')
File "twitter_script.py", line 76, in search_twitter
db_name=write_data_to_db(*get_twitter_data(query))
File "twitter_script.py", line 24, in get_twitter_data
html_full=return_html_code(url)
File "c:\Users\sony\Desktop\social_network_extract_old\social_network_extract\scrollDownHtmlCode.py", line 48, in return_html_code
html_full_source=driver.page_source
File "c:\Anaconda\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 464, in page_source
return self.execute(Command.GET_PAGE_SOURCE)['value']
File "c:\Anaconda\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 199, in execute
response = self.command_executor.execute(driver_command, params)
File "c:\Anaconda\lib\site-packages\selenium\webdriver\remote\remote_connection.py", line 395, in execute
return self._request(command_info[0], url, body=data)
File "c:\Anaconda\lib\site-packages\selenium\webdriver\remote\remote_connection.py", line 463, in _request
resp = opener.open(request, timeout=self._timeout)
File "c:\Anaconda\lib\urllib2.py", line 431, in open
response = self._open(req, data)
File "c:\Anaconda\lib\urllib2.py", line 449, in _open
'_open', req)
File "c:\Anaconda\lib\urllib2.py", line 409, in _call_chain
result = func(*args)
File "c:\Anaconda\lib\urllib2.py", line 1227, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "c:\Anaconda\lib\urllib2.py", line 1200, in do_open
r = h.getresponse(buffering=True)
File "c:\Anaconda\lib\httplib.py", line 1136, in getresponse
response.begin()
File "c:\Anaconda\lib\httplib.py", line 453, in begin
version, status, reason = self._read_status()
File "c:\Anaconda\lib\httplib.py", line 409, in _read_status
line = self.fp.readline(_MAXLINE + 1)
File "c:\Anaconda\lib\socket.py", line 480, in readline
data = self._sock.recv(self._rbufsize)
socket.error: [Errno 10054] An existing connection was forcibly closed by the remote host
编辑3:针对不同的网址尝试相同的代码。
https://twitter.com/search?q=Alcoholics%20Anonymous%20Drunk%20since%3A2006-03-24%20until%3A2006-04-23&src=typd&lang=en
Traceback (most recent call last):
File "twitter_script.py", line 64, in <module>
search_twitter('Alcoholics Anonymous Drunk')
File "twitter_script.py", line 58, in search_twitter
db_name=write_data_to_db(*get_twitter_data(query))
File "twitter_script.py", line 31, in get_twitter_data
html_full=return_html_code(url)
File "c:\Users\sony\Desktop\social_network_extract_old\social_network_extract\scrollDownHtmlCode.py", line 30, in return_html_code
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "li[data-item-id]")))
File "c:\Anaconda\lib\site-packages\selenium\webdriver\support\wait.py", line 80, in until
raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message:
Screenshot: available via screen
编辑4:
ubuntu@ip-172-31-38-123:~/social_network_extract_proxy$ cat error.txt
Traceback (most recent call last):
File "twitter_script.py", line 70, in <module>
search_twitter('alcoholics anonymous')
File "twitter_script.py", line 64, in search_twitter
db_name=write_data_to_db(*get_twitter_data(query))
File "twitter_script.py", line 37, in get_twitter_data
html_full=return_html_code(url)
File "/home/ubuntu/social_network_extract_proxy/firefox_driver_code.py", line 35, in return_html_code
driver=webdriver.Firefox(firefox_profile=profile)
File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/selenium/webdriver/firefox/webdriver.py", line 79, in __init__
self.binary, timeout),
File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/selenium/webdriver/firefox/extension_connection.py", line 49, in __init__
self.binary.launch_browser(self.profile)
File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/selenium/webdriver/firefox/firefox_binary.py", line 68, in launch_browser
self._wait_until_connectable()
File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/selenium/webdriver/firefox/firefox_binary.py", line 106, in _wait_until_connectable
% (self.profile.path))
selenium.common.exceptions.WebDriverException: Message: Can't load the profile. Profile Dir: /tmp/tmpvFoPrE If you specified a log_file in the FirefoxBinary constructor, check it for details.
过一会儿出现上述错误。
这是使它在无头模式下对我有用的一系列方法:
PhantomJS
代码:
import time
def return_html_code(url):
    """Open *url* in headless PhantomJS, scroll until no more tweets load, and return the HTML.

    Each round scrolls to the top and back to the last ``li[data-item-id]``
    element five times, then waits for the element count to grow. Scrolling
    stops when the count does not grow before the wait times out.

    Args:
        url: address of the page to load.

    Returns:
        The page source as reported by the driver after scrolling finished.

    Raises:
        TimeoutException: if no tweet element becomes visible during the
            initial wait.
    """
    tweet_locator = (By.CSS_SELECTOR, "li[data-item-id]")

    # Present a desktop-browser user agent instead of PhantomJS's own.
    dcap = dict(webdriver.DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36"

    driver = webdriver.PhantomJS(desired_capabilities=dcap)
    try:
        driver.maximize_window()
        driver.get(url)

        # initial wait for the tweets to load
        wait = WebDriverWait(driver, 30)
        wait.until(EC.visibility_of_element_located(tweet_locator))

        # scroll down to the last tweet until no more tweets are loaded
        while True:
            tweets = driver.find_elements(*tweet_locator)
            if not tweets:
                # Nothing to scroll to; avoid IndexError on tweets[-1].
                break
            number_of_tweets = len(tweets)
            print(number_of_tweets)

            # move to the top and then to the bottom 5 times in a row
            for _ in range(5):
                driver.execute_script("window.scrollTo(0, 0)")
                driver.execute_script("arguments[0].scrollIntoView(true);", tweets[-1])
                time.sleep(0.5)

            try:
                wait.until(
                    wait_for_more_than_n_elements_to_be_present(
                        tweet_locator, number_of_tweets
                    )
                )
            except TimeoutException:
                # No new tweets appeared within the timeout: treat as the end.
                break

        # Hand the collected HTML back to the caller.
        return driver.page_source
    finally:
        # Always shut the PhantomJS session down, even on failure.
        driver.quit()
我正在拼命地尝试在一个使用Kotlin的android应用上实现无休止的滚动。所有的教程要么是无用的,因为他们没有正确地解释事情。例如:https://github.com/chetdeva/recyclerView-bindings 它看起来很有前途,但作者使用了这样的短语,比如“把这个放在你的bindingadapter中”,所以我查看了这个是什么,我找到了一个java文件,但如果你在其中插入
无限滚动用来在页面滚动到接近底部时加载新内容或进行其他操作。 在底部的无限滚动 你只需在可滚动的容器上添加“infinite-scroll”类,一般是页面滚动区域 - div.content <style type="text/css"> .infinite-scroll-preloader { margin-top:-20px; } </style> <heade
无限滚动用来在页面滚动到接近底部时加载新内容或进行其他操作。 无限滚动HTML结构 你只需在可滚动的容器上添加“infinite-scroll”类,一般是页面滚动区域 - <div class="page-content">: <div class="page"> <div class="page-content infinite-scroll" data-distance="100">
嗯,我也试着把它藏在身体里!还是不工作..?
无限滚动 iScroll集成了一套智能缓存系统,它允许对一堆近乎无限的元素的数据进行处理和重用。目前无限滚动还处于早期开发阶段,所以虽然当前版本已经足够稳定,但是还没有做好广泛推广的准备。 具体示例请参考 无限滚动示例 ,并请提交你的建议和报告bug。 作者将会尽可能快的将功能变更的细节添加上来。
滚动至底部时,加载更多数据。 基础用法 在要实现滚动加载的列表上上添加v-infinite-scroll,并赋值相应的加载方法,可实现滚动到底部时自动执行加载方法。 demo <template> <ul class="infinite-list" v-infinite-scroll="load" style="overflow:auto"> <li v-for="i in count