当前位置: 首页 > 面试题库 >










/**
 * Builds the absolute path of a scratch file used by the scraper.
 *
 * The directory is hard-coded to '/var/tmp/www.foo.bar/'; the base name is
 * appended verbatim (no sanitising, no existence check).
 *
 * @param string $basename file name relative to the scratch directory
 * @return string          absolute path of the file
 */
function path($basename) {
    return '/var/tmp/www.foo.bar/' . $basename;
}

/**
 * Minimal cURL-backed "browser" that keeps cookies and a referer between
 * requests made against a single site.
 *
 * Subclasses supply path() to decide where per-site files (currently only
 * the cookie file) live, and use get()/post()/option() to drive the site.
 */
abstract class Browser
{
    /** @var array Merged configuration: 'site', 'userAgent', 'waitTime' plus any caller-supplied keys. */
    private $options = [];

    /** @var array Navigation state: 'referer', 'url' and the cURL handle under 'curl'. */
    private $state   = [];

    /** @var string Cookie file path; used both to load and to save cookies. */
    protected $cookies;

    /**
     * Maps a base name (e.g. 'cookies') to the path of a per-site file.
     *
     * @param string $basename
     * @return string
     */
    abstract protected function path($basename);

    /**
     * @param string $site    scheme and host every request URL is prefixed with
     * @param array  $options overrides/extras merged over the built-in defaults
     * @throws \Exception when a cURL option cannot be set
     */
    public function __construct($site, $options = []) {
        $this->cookies = $this->path('cookies');
        // Defaults first, so that caller-supplied options win.
        $this->options = array_merge(
            [
                'site'      => $site,
                'userAgent' => 'Mozilla/5.0 (Windows NT 5.1; rv:16.0) Gecko/20100101 Firefox/16.0 - LeoScraper',
                'waitTime'  => 250000,
            ],
            (array) $options
        );
        $this->state = [
            'referer' => '/',
            'url'     => '',
            'curl'    => '',
        ];
        // Creation needs the same handle set-up as waking from sleep.
        $this->__wakeup();
    }

    /**
     * Reactivates after sleep (e.g. in session) or creation: opens a fresh
     * cURL handle and applies the standing transfer options.
     *
     * @throws \Exception when a cURL option cannot be set
     */
    public function __wakeup() {
        $this->state['curl'] = curl_init();
        $this->config([
            CURLOPT_USERAGENT       => $this->options['userAgent'],
            CURLOPT_ENCODING        => '',
            // ...retrieving the body...
            CURLOPT_NOBODY          => false,
            // ...into the return value of curl_exec()...
            // (CURLOPT_BINARYTRANSFER was dropped: it no longer has any
            // effect and is deprecated; RETURNTRANSFER already yields raw data.)
            CURLOPT_RETURNTRANSFER  => true,
            // ...following redirections...
            CURLOPT_FOLLOWLOCATION  => true,
            // ...reasonably...
            CURLOPT_MAXREDIRS       => 5,
            // Load from these cookies
            CURLOPT_COOKIEFILE      => $this->cookies,
            // Save these cookies
            CURLOPT_COOKIEJAR       => $this->cookies,
            // Seconds
            CURLOPT_CONNECTTIMEOUT  => 30,
            // Seconds
            CURLOPT_TIMEOUT         => 300,
            // 16 Kb/s
            CURLOPT_LOW_SPEED_LIMIT => 16384,
            // Seconds below the speed limit before the transfer is aborted
            CURLOPT_LOW_SPEED_TIME  => 15,
        ]);
    }

    /**
     * Imports an options array into the cURL handle.
     *
     * @param array $opts map of CURLOPT_* constant => value
     * @throws \Exception when curl_setopt() rejects an option
     */
    private function config(array $opts = []) {
        foreach ($opts as $key => $value) {
            if (true !== curl_setopt($this->state['curl'], $key, $value)) {
                throw new \Exception('Could not set cURL option');
            }
        }
    }

    /**
     * Requests $url (relative to the configured site) with whatever method
     * was configured beforehand, sending the previous URL as referer.
     *
     * @param string $url path (and query) appended to the 'site' option
     * @return mixed      whatever curl_exec() returns: the body, or false on failure
     * @throws \Exception when a cURL option cannot be set
     */
    private function perform($url) {
        $this->state['referer'] = $this->state['url'];
        $this->state['url'] = $url;
        $this->config([
            CURLOPT_URL     => $this->options['site'] . $this->state['url'],
            CURLOPT_REFERER => $this->options['site'] . $this->state['referer'],
        ]);
        $response = curl_exec($this->state['curl']);
        // Should we ever want to randomize waitTime, do so here.
        $pause = (int) $this->options['waitTime'];
        if ($pause > 0) {
            // Throttle: waitTime is expressed in microseconds.
            usleep($pause);
        }

        return $response;
    }

    /**
     * Returns a configuration option, optionally replacing it.
     *
     * @param string $key       configuration key name
     * @param mixed  $value     value to set; the '__DEFAULT__' sentinel means "read only"
     * @return mixed            the value held before the call, or null for an unknown key
     */
    protected function option($key, $value = '__DEFAULT__') {
        // array_key_exists avoids an "undefined index" notice on unknown keys.
        $curr = array_key_exists($key, $this->options) ? $this->options[$key] : null;
        if ('__DEFAULT__' !== $value) {
            $this->options[$key] = $value;
        }
        return $curr;
    }

    /**
     * Performs a POST.
     *
     * @param string $url    path relative to the configured site
     * @param array  $fields form fields, sent URL-encoded
     * @return mixed         response body, or false on failure
     * @throws \Exception when a cURL option cannot be set
     */
    public function post($url, array $fields) {
        $this->config([
            CURLOPT_POST       => true,
            CURLOPT_POSTFIELDS => http_build_query($fields),
        ]);
        return $this->perform($url);
    }

    /**
     * Performs a GET.
     *
     * @param string $url    path relative to the configured site
     * @param array  $fields query parameters appended to $url
     * @return mixed         response body, or false on failure
     * @throws \Exception when a cURL option cannot be set
     */
    public function get($url, array $fields = []) {
        $this->config([ CURLOPT_POST => false ]);
        if (empty($fields)) {
            $query = '';
        } else {
            // Extend an existing query string instead of starting a second one.
            $glue  = (false === strpos($url, '?')) ? '?' : '&';
            $query = $glue . http_build_query($fields);
        }
        return $this->perform($url . $query);
    }
}


/* WWW_FOO_COM requires username and password to construct */

/**
 * Scraper for the (fictitious) www.foo.bar.baz site, which needs a username
 * and password before its pages can be fetched.
 *
 * NOTE(review): the endpoint paths below are placeholders. The paths used by
 * the original sample are not present in this snippet; replace them with the
 * target site's real URLs.
 */
class WWW_FOO_COM_Browser extends Browser
{
    /** Placeholder: page requested first to open a session. */
    const HOME_PATH   = '/';
    /** Placeholder: page holding the login form. */
    const LOGIN_PAGE  = '/login.do';
    /** Placeholder: endpoint the credentials are POSTed to. */
    const LOGIN_POST  = '/ajax/loginPerform';
    /** Placeholder: page queried by scrape(). */
    const SCRAPE_PATH = '/foobars/baz.do';

    /** @var bool True once login() has completed without a transport failure. */
    private $loggedIn   = false;

    /**
     * Locates per-site files (the cookie file) in the scraper's scratch directory.
     *
     * @param string $basename
     * @return string
     */
    protected function path($basename) {
        return '/var/tmp/www.foo.bar/' . $basename;
    }

    /**
     * Stores the credentials as options and visits the pages that precede
     * the login request.
     *
     * @param string $username
     * @param string $password
     * @throws \Exception when a cURL option cannot be set
     */
    public function __construct($username, $password) {
        parent::__construct('http://www.foo.bar.baz', [
            'username'  => $username,
            'password'  => $password,
            'waitTime'  => 250000,
            'userAgent' => 'FooScraper',
            'cache'     => true
        ]);
        // Open the session
        $this->get(self::HOME_PATH);
        // Navigate to the login page
        $this->get(self::LOGIN_PAGE);
    }

    /**
     * Perform login.
     *
     * @return bool false when the request itself failed, true otherwise
     * @throws \Exception when a cURL option cannot be set
     */
    public function login() {
        $response = $this->post(
            self::LOGIN_POST,
            [
                'j_un'    => $this->option('username'),
                'j_pw'    => $this->option('password'),
            ]
        );
        // A failed transfer cannot have logged us in.
        if (false === $response) {
            $this->loggedIn = false;
            return false;
        }
        // TODO: verify that response is OK.
        // if (!strstr($response, "Welcome " . $this->option('username'))
        //     throw new \Exception("Bad username or password")
        $this->loggedIn = true;
        return true;
    }

    /**
     * Fetches the page for one entry.
     *
     * @param string $entry value sent as the 'ticker' query parameter
     * @return mixed        response body, or false on failure
     * @throws \Exception when a cURL option cannot be set
     */
    public function scrape($entry) {
        // We could implement caching to avoid scraping the same entry
        // too often. Save $data into path("entry-" . md5($entry))
        // and verify the filemtime of said file, is it newer than time()
        // minus, say, 86400 seconds? If yes, return file_get_content and
        // leave remote site alone.
        $data = $this->get(
            self::SCRAPE_PATH,
            [
                'ticker' => $entry
            ]
        );
        return $data;
    }
}


    // Example usage: log in once, then fetch every requested entry.
    $scraper = new WWW_FOO_COM_Browser('lserni', 'mypassword');
    if (!$scraper->login()) {
        throw new \Exception("bad user or pass");
    }
    // $entries is expected to be provided by the surrounding script;
    // fall back to an empty list so the loop is a no-op when it is not.
    foreach ((isset($entries) ? $entries : []) as $entry) {
        $html = $scraper->scrape($entry);
        // Parse HTML
    }

  • 问题内容: 首先,值得一提的是,我知道已经有很多类似的问题,但没有一个对我有用。 我是 Python、HTML 和网络抓取的新手。我正在尝试从需要先登录的网站上抓取用户信息。在我的测试中,我以从 GitHub 抓取我的电子邮件设置为例。主页是“ https://github.com/login ”,目标页面是“ https://github.com/settings/emails ” 这

  • 问题内容: 如果我想抓取一个需要先使用密码登录的网站,我该如何使用beautifulsoup4库开始使用python抓取它?以下是我对不需要登录的网站的处理方式。 应该如何更改代码以适应登录?假设我要抓取的网站是一个需要登录的论坛。一个示例是http://forum.arduino.cc/index.php 问题答案: 您可以使用机械化: 或urllib-使用urllib2登录网站

  • ''我想对需要登录的网站执行网页抓取。我尝试了两种不同的代码写法,但仍然无法完成登录。'' # 使用 BeautifulSoup 在 Python 中编写的代码: # 第一种方法:from bs4 import …;import requests;import http.cookiejar;import urllib.request;import urllib.parse '' #

  • 本文向大家介绍对python抓取需要登录网站数据的方法详解,包括了对python抓取需要登录网站数据的方法详解的使用技巧和注意事项,需要的朋友参考一下 scrapy.FormRequest login.py selenium登录获取cookie get_cookie_by_selenium.py 获取浏览器cookie(以Ubuntu的Firefox为例) get_cookie_by_firefo

  • 问题内容: 我需要从此网站Link中抓取新闻公告。公告似乎是动态生成的。它们不会出现在源代码中。我通常使用机械化,但是我认为它不会起作用。我该怎么办?我可以使用python或perl。 问题答案: 礼貌的选择是询问网站所有者是否具有允许您访问其新闻报道的API。 不太礼貌的选择是跟踪页面加载时发生的HTTP事务,并确定哪一个是AJAX调用,该调用会提取数据。 看起来就是这个。但是看起来它可能包含会

  • 问题内容: 我遇到过许多教程,它们解释了如何使用node.js刮取不需要身份验证/登录的公共网站。 有人可以解释如何抓取需要使用node.js登录的网站吗? 问题答案: 使用Mikeal的请求库,您需要启用cookie支持,如下所示: 因此,您首先应该在该站点上(手动)创建一个用户名,并在向该站点发出POST请求时将用户名和密码作为参数传递。之后,服务器将使用Cookie进行响应,该请求将记住该C