动漫之家-漫画爬取

鲜于凯歌
2023-12-01

[注意]:仅作为一个示例来讨论,请不要恶意占用服务器资源,漫画在线上看的体验反而好一些

[分析]:动漫之家的反爬虫主要是靠User-Agent,加一个headers就好了。这里主要讨论一下如何获取img_urls。

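分析漫画第一章第一页可以发现,<head>标签里的第一个script脚本里已经生成了这一章所有图片的链接(中间那个加密函数分析了一下,其实并不是真正的加密,而是常见的 eval(function(p,a,c,k,e,d){...}) 打包压缩:按末尾的词典把短标记替换回原词,比如 m 还原为 pages、v 还原为 var,执行后就得到变量 pages;更细的细节以后有兴趣再研究),在console里输入arr_pages(也就是脚本里 eval(pages) 的结果),果然获取了所有图片的url,补上图库链接头部 https://images.dmzj.com/ 就得到了所有图片链接。具体做法就是用BeautifulSoup或lxml拿到script后用execjs运行一下脚本,最后取出arr_pages的值就可以爬爬爬了(记得加上Referer)。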

 

<script type="text/javascript">
        var arr_img = new Array();
        var page = '';
        eval(function(p,a,c,k,e,d){e=function(c){return(c<a?'':e(parseInt(c/a)))+((c=c%a)>35?String.fromCharCode(c+29):c.toString(36))};if(!''.replace(/^/,String)){while(c--){d[e(c)]=k[c]||e(c)}k=[function(e){return d[e]}];e=function(){return'\\w+'};c=1};while(c--){if(k[c]){p=p.replace(new RegExp('\\b'+e(c)+'\\b','g'),k[c])}}return p}('v m=m=\'["h\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%k\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%l\\/x.j","h\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%k\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%l\\/y.j","h\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%k\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%l\\/u.j","h\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%k\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%l\\/A.j","h\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%k\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%l\\/z.j","h\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%k\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%l\\/B.j","h\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%k\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%l\\/r.j","h\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%k\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%l\\/p.j","h\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%k\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%l\\/o.j","h\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%k\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%l\\/q.j","h\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%k\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%l\\/s.j","h\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%k\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%l\\/t.j","h\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%k\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%l\\/n.j","h\\/%3%2%0%1%5
%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%k\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%l\\/w.j","h\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%k\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%l\\/J.j","h\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%k\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%l\\/M.j","h\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%k\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%l\\/L.j","h\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%k\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%l\\/O.j","h\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%k\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%l\\/P.j","h\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%k\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%l\\/Q.j","h\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%k\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%l\\/N.j","h\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%k\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%l\\/K.j","h\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%k\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%l\\/E.j","h\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%k\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%l\\/D.j","h\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%k\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%l\\/C.j","h\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%k\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%l\\/F.j","h\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%k\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%l\\/G.j","h\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%k\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%l\\/I.j","h\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%k\\/%3%2%0%1%5%4%9%a%0%d%b%7%6%8%c-%f%i%g%3%2%0%1%5%4%6%e%l\\/H.j"
]\';',53,53,'91|E4|BB|E9|8B|B9|E8|A6|80|E5|A5|BA|85|E7|8A|E6|86||BC|jpg|B1|B1_ch01|pages|013|009|008|010|007|011|012|003|var|014|001|002|005|004|006|025|024|023|026|027|029|028|015|022|017|016|021|018|019|020'.split('|'),0,{}))

        ;
        // Page-level globals emitted by the site for the chapter being viewed.
        var g_comic_name = "黑之契约者-漆黑之花";
        var g_chapter_name = "第01话";
        // Site-relative URLs of the comic and of this chapter's page.
        var g_comic_url = "hzqyzqhzh/";
        var g_chapter_url = "hzqyzqhzh/3505.shtml";
        var g_current_page = 1;
        // 29 here; the packed page list above also contains 29 entries.
        var g_max_pic_count = 29;
        var g_page_base = '';
        // Chained assignments: these also set the globals res_id and chapter_id.
        var g_comic_id = res_id = '1499';
        var g_chapter_id = chapter_id = '3505';
        var g_comic_code = '9237261211e72bf656b9093c28f90dff';
        // `pages` is the JSON-array string assigned by the packed eval(...) call
        // above (its payload unpacks to `var pages=pages='[...]';`). Evaluating
        // that string yields the array of relative image paths for this chapter,
        // which is the value a scraper wants to read back.
        var arr_pages = eval(pages);
        var next_chapter_pages = '["h\/%E9%BB%91%E4%B9%8B%E5%A5%91%E7%BA%A6%E8%80%85-%E6%BC%86%E9%BB%91%E4%B9%8B%E8%8A%B1\/%E9%BB%91%E4%B9%8B%E5%A5%91%E7%BA%A6%E8%80%85-%E6%BC%86%E9%BB%91%E4%B9%8B%E8%8A%B1_ch02\/001.jpg","h\/%E9%BB%91%E4%B9%8B%E5%A5%91%E7%BA%A6%E8%80%85-%E6%BC%86%E9%BB%91%E4%B9%8B%E8%8A%B1\/%E9%BB%91%E4%B9%8B%E5%A5%91%E7%BA%A6%E8%80%85-%E6%BC%86%E9%BB%91%E4%B9%8B%E8%8A%B1_ch02\/002.jpg","h\/%E9%BB%91%E4%B9%8B%E5%A5%91%E7%BA%A6%E8%80%85-%E6%BC%86%E9%BB%91%E4%B9%8B%E8%8A%B1\/%E9%BB%91%E4%B9%8B%E5%A5%91%E7%BA%A6%E8%80%85-%E6%BC%86%E9%BB%91%E4%B9%8B%E8%8A%B1_ch02\/003.jpg"]';
        var arr_nextchapter_pages = eval(next_chapter_pages);
        var final_page_url = "/hzqyzqhzh/jump.shtml?1499_3505&fbee072bb6c4a008aae02970f932a1fc";
        var sns_sys_id = '1499_3505';
        var sns_view_point_token = 'fbee072bb6c4a008aae02970f932a1fc';
        var is_hot_comic = false;
        var is_fast_comic = true;
        var server_name = 0;
        var page_site_root = '/';
        var res_type = 1;
</script>
# 部分代码示例(爬的是食灵,不是黑契)

def download_img(url):
    """Download one image and save it in the current working directory.

    Args:
        url: Image path relative to the module-level ``img_host``; the request
            goes to ``img_host + '/' + url`` with the module-level ``headers``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or when the
            3-second timeout expires.

    The file is named after the last path segment of ``url``. Responses that
    are neither 200 nor an HTTP error are skipped without writing a file.
    """
    res = requests.get(img_host + '/' + url, headers=headers, timeout=3)

    if res.status_code != 200:
        # The original referenced `res.raise_for_status` without calling it,
        # so failed downloads were silently ignored; actually raise here.
        res.raise_for_status()
        return

    # Name the file after the last segment of the image link, for convenience.
    # `res.content` is raw bytes, so no text encoding needs to be set.
    with open(url.split('/')[-1], 'wb') as f:
        f.write(res.content)


# Headers sent with every request. Per the notes above, the site's anti-scraping
# check mainly looks at User-Agent, and image requests also need a Referer.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/74.0.3729.131 Safari/537.36'
    ),
    'Referer': 'https://manhua.dmzj.com/shiling/',
}

# Resolve the output root to an absolute path once, before any chdir().
# The loop below changes the working directory for every chapter, so a relative
# `path` would otherwise be re-resolved against the previous chapter's directory
# and produce ever-deeper nested folders.
base_dir = os.path.abspath(os.path.join(path, title))

# Chapter links: every <a> under the first "cartoon_online_border" element whose
# href matches '/shiling'.
chapters = soup(class_ = "cartoon_online_border")[0].find_all('a', href = re.compile('/shiling'))

for chapter in chapters:
    current_path = os.path.join(base_dir, chapter.string)
    # makedirs also creates the missing parent (path/title) and tolerates an
    # already-existing chapter directory, replacing the `not exists and mkdir` trick.
    os.makedirs(current_path, exist_ok=True)
    # download_img() writes into the current working directory.
    os.chdir(current_path)

    # The first <script> of the chapter page defines `arr_pages`.
    # NOTE(review): get_html is defined elsewhere; presumably it returns a parsed
    # BeautifulSoup document - confirm.
    js = get_html(arr_url + chapter['href']).find('script')
    # execjs needs the bare script text, not the surrounding <script> tag.
    ctx = execjs.compile(js.string)
    imgs = ctx.eval('arr_pages')

    for img in imgs:
        download_img(img)

连接/下载速度还行,所以没考虑异常处理

 类似资料: