laravel编写sitemap爬虫

东郭和光
2023-12-01

搜索引擎收录需要网站的 sitemap(站点地图),即网站主域名下所有可以跳转到的链接地址,因此需要写一个爬虫把这些链接抓取出来。

拿到这个需求后,有没有觉得它和面试时经常被问到的一道题非常相似?那道题是:获取指定目录下所有的子目录及文件。这类需求通常用递归来实现。具体效果请看:

大象收车-报废车就上大象收车

<?php

namespace App\Console\Commands;

use Illuminate\Console\Command;
use Illuminate\Support\Arr;

/**
 * Artisan command that crawls SITE_URL recursively and writes every
 * same-site link it discovers to storage/app/public/sitemap.txt,
 * one absolute URL per line.
 */
class siteMapCommand extends Command
{
    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'sitemap';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = '获取网站链接';

    // Root address to crawl (scheme + host, no trailing slash).
    const SITE_URL = 'https://www.xxxxxx.com';

    /**
     * Create a new command instance.
     *
     * @return void
     */
    public function __construct()
    {
        parent::__construct();
    }

    /**
     * Execute the console command.
     *
     * Crawls the site and writes the collected URLs to sitemap.txt.
     * The file is rewritten in a single call on every run, so repeated
     * executions do not accumulate duplicate entries.
     *
     * @return mixed
     */
    public function handle()
    {
        $res = self::getUrl(self::SITE_URL);

        $target = storage_path('app/public/').'sitemap.txt';
        if (file_put_contents($target, implode("\n", $res)."\n") === false) {
            $this->error('Unable to write '.$target);
            return 1;
        }

        $this->info('ok');
    }

    /**
     * Recursion entry point.
     *
     * Seeds the result map with the site root and starts the crawl.
     *
     * @param string $url Absolute URL of the page to start from.
     * @return string[] Map of site-relative path => absolute URL.
     */
    protected static function getUrl($url)
    {
        $res = [
            '/' => self::SITE_URL.'/'
        ];
        self::getHTMLFile($url, $res);
        return $res;
    }

    /**
     * Recursive crawler.
     *
     * Loads $url, collects every anchor href plus the pagination links
     * derived from <li class="number"> elements, and recurses into each
     * site-relative path not yet present in $arr. $arr doubles as the
     * visited set, so a path is fetched at most once.
     *
     * @param string   $url Absolute URL of the page to load.
     * @param string[] $arr Map of site-relative path => absolute URL, filled in place.
     */
    protected static function getHTMLFile($url, & $arr)
    {
        // Collect libxml parse errors internally: real-world HTML routinely
        // triggers markup warnings that would otherwise be raised as PHP
        // warnings (which Laravel's error handler is expected to turn into
        // exceptions, aborting the page).
        $previous = libxml_use_internal_errors(true);

        try {
            $dom = new \DOMDocument();
            $xml = $dom->loadHTMLFile($url) ? simplexml_import_dom($dom) : null;
            if (!$xml instanceof \SimpleXMLElement) {
                throw new \RuntimeException('Unable to parse '.$url);
            }

            // Plain links.
            $candidates = [];
            foreach ($xml->xpath("//a[@href]") ?: [] as $anchor) {
                $candidates[] = trim((string) $anchor['href']);
            }

            // Pagination: build site-relative "?page=N" links. They must be
            // relative (and free of any previous query string) so that they
            // pass the same-site filter below instead of being discarded.
            $basePath = explode('?', str_replace(self::SITE_URL, '', $url), 2)[0];
            if ($basePath === '') {
                $basePath = '/';
            }
            foreach ($xml->xpath("//li[@class='number']") ?: [] as $item) {
                $number = trim((string) $item);
                if ($number !== '' && ctype_digit($number)) {
                    $candidates[] = $basePath.'?page='.$number;
                }
            }

            foreach ($candidates as $candidate) {
                // Fragments never change the fetched document.
                $path = explode('#', $candidate, 2)[0];

                // Keep only site-relative paths: a single leading slash.
                // Protocol-relative hrefs ("//host/...") point elsewhere.
                if (!preg_match('#^/(?!/)#', $path)) {
                    continue;
                }
                if (isset($arr[$path])) {
                    continue;
                }

                // Mark as visited before recursing so cycles terminate; the
                // recursive call performs the one and only fetch of the page.
                $arr[$path] = self::SITE_URL.$path;
                self::getHTMLFile($arr[$path], $arr);
            }
        } catch (\Exception $e) {
            // The page could not be loaded or parsed: still list it, unless
            // it is already recorded.
            $key = str_replace(self::SITE_URL, '', $url);
            if ($key === '') {
                $key = '/';
            }
            if (!isset($arr[$key])) {
                $arr[$key] = $url;
            }
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }
    }

}

 类似资料: