- Crawl the jokes on Qiushibaike (糗事百科); the page URL is http://www.qiushibaike.com/8hr/page/
- Fetch the pages with requests and extract the data with XPath (a minimal sketch of the extraction idea follows this list).
- From each post, extract the user avatar link, the username, the joke content, the upvote count, and the comment count.
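The sketch below shows the core idea before the full crawler: `etree.HTML` parses the page into a DOM, and the XPath predicate `contains(@id, "qiushi_tag")` fuzzy-matches every post `div` whose id contains `qiushi_tag` (the numeric suffix differs per post). The HTML here is a simplified stand-in for the real page structure, not a capture of it.

```python
# -*- coding: utf-8 -*-
from lxml import etree

# Simplified stand-in for one post; real ids look like "qiushi_tag_121884985".
page = etree.HTML('''
<div id="qiushi_tag_123456">
  <div class="author"><a><img src="//pic.qiushibaike.com/a.jpg" alt="someuser"/></a></div>
  <h2>someuser</h2>
  <div class="content"><span>a short joke</span></div>
  <i class="number">99</i><i class="number">3</i>
</div>''')

for post in page.xpath('//div[contains(@id, "qiushi_tag")]'):
    print(post.xpath('.//img/@src')[0])                          # avatar link
    print(post.xpath('.//h2')[0].text)                           # username
    print(post.xpath('.//div[@class="content"]/span')[0].text)   # joke content
    print(post.xpath('.//i')[0].text)                            # upvote count
    print(post.xpath('.//i')[1].text)                            # comment count
```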
A first complete version (Python 2: note the print statement and raw_input):

```python
# -*- coding: utf-8 -*-
import requests
from lxml import etree

def loadPage(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
        'Accept-Language': 'zh-CN,zh;q=0.8'}
    try:
        response = requests.get(url, headers=headers)
        resHtml = response.text
        html = etree.HTML(resHtml)
        # every post lives in a div whose id contains "qiushi_tag"
        result = html.xpath('//div[contains(@id,"qiushi_tag")]')
        for site in result:
            imgUrl = site.xpath('./div/a/img/@src')[0].encode('utf-8')
            # alternatively: site.xpath('.//img/@alt')[0]
            username = site.xpath('.//h2')[0].text
            content = site.xpath('.//div[@class="content"]/span')[0].text.strip().encode('utf-8')
            # upvote count
            vote = site.xpath('.//i')[0].text
            # comment count
            comments = site.xpath('.//i')[1].text
            print imgUrl, username, content, vote, comments
    except Exception as e:
        print e

def qiushiSpider(url, beginPage, endPage):
    """
    Scheduler: combines the base URL with each page number and crawls it.
    url       : the fixed prefix of the page URL
    beginPage : first page
    endPage   : last page
    """
    for page in range(beginPage, endPage + 1):
        fullurl = url + str(page)
        loadPage(fullurl)

if __name__ == "__main__":
    beginPage = int(raw_input("Start page: "))
    endPage = int(raw_input("End page: "))
    url = 'http://www.qiushibaike.com/8hr/page/'
    qiushiSpider(url, beginPage, endPage)
```
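One caveat on the listing above: if a single post lacks an avatar or content, the bare `[0]` indexing raises IndexError, and the surrounding try/except then abandons the entire page. A small helper that tolerates missing nodes is one way to harden this; the sketch below is Python 3, and `first` is a name introduced here for illustration, not from the original.

```python
def first(node, xpath_expr, default=""):
    """Return the first XPath match under node, or a default when nothing matches."""
    matches = node.xpath(xpath_expr)
    return matches[0] if matches else default
```

`loadPage` could then call `first(site, './div/a/img/@src')` and keep extracting the remaining posts even when one of them has no image.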
Save the results to a JSON file (this version uses urllib2, so it is Python 2 as well):
```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import urllib2
import json
from lxml import etree


def loadPage(url):
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}

    request = urllib2.Request(url, headers=headers)
    html = urllib2.urlopen(request).read()
    # the response is a byte string; parse it into an HTML DOM
    text = etree.HTML(html)
    # contains() is a fuzzy match: select every post div whose id contains "qiushi_tag"
    node_list = text.xpath('//div[contains(@id, "qiushi_tag")]')

    for node in node_list:
        # xpath returns a list; take the single match by index -- the username
        username = node.xpath('.//img/@alt')[0]
        # image links (an empty list when the post has no picture)
        image = node.xpath('.//div[@class="thumb"]//@src')
        # text inside the tag -- the joke content
        content = node.xpath('.//div[@class="content"]/span')[0].text
        # upvote count
        zan = node.xpath('.//i')[0].text
        # comment count
        comments = node.xpath('.//i')[1].text

        items = {
            "username": username,
            "image": image,
            "content": content,
            "zan": zan,
            "comments": comments
        }

        # append one JSON object per line
        with open("qiushi.json", "a") as f:
            f.write(json.dumps(items, ensure_ascii=False).encode("utf-8") + "\n")

def qiushiSpider(url, beginPage, endPage):
    """
    Scheduler: combines the base URL with each page number and crawls it.
    url       : the fixed prefix of the page URL
    beginPage : first page
    endPage   : last page
    """
    for page in range(beginPage, endPage + 1):
        fullurl = url + str(page)
        loadPage(fullurl)

if __name__ == "__main__":
    beginPage = int(raw_input("Start page: "))
    endPage = int(raw_input("End page: "))
    url = 'http://www.qiushibaike.com/8hr/page/'
    qiushiSpider(url, beginPage, endPage)
```
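Because the file is opened in append mode and one object is written per post, `qiushi.json` ends up in JSON Lines form (one JSON object per line) rather than as a single JSON document. A minimal Python 3 sketch of reading it back:

```python
import json

# qiushi.json holds one JSON object per line, so parse it line by line
# instead of with a single json.load() call.
with open("qiushi.json", encoding="utf-8") as f:
    for line in f:
        item = json.loads(line)
        print(item["username"], item["zan"], item["comments"])
```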
Crawling the Qiushibaike jokes in Python 3:
```python
import requests
import json
from lxml import etree
from urllib import parse


def loadPage(url):
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}

    response = requests.get(url, headers=headers)
    html = response.content
    # the response body is bytes; parse it into an HTML DOM
    text = etree.HTML(html)
    # contains() is a fuzzy match: select every post div whose id contains "qiushi_tag"
    node_list = text.xpath('//div[contains(@id, "qiushi_tag")]')

    for node in node_list:
        # xpath returns a list; take the single match by index -- the username
        username = node.xpath('.//img/@alt')[0]
        # image links (an empty list when the post has no picture)
        image = node.xpath('.//div[@class="thumb"]//@src')
        # the joke content
        content = node.xpath('.//div[@class="content"]/span')[0].text
        # upvote count
        zan = node.xpath('.//i')[0].text
        # comment count
        comments = node.xpath('.//i')[1].text

        items = {
            "username": username,
            "image": image,
            "content": content,
            "zan": zan,
            "comments": comments
        }

        with open("qiushi.json", "a", encoding="utf-8") as f:
            # in Python 3, json.dumps already returns str, so no .encode() is needed
            f.write(json.dumps(items, ensure_ascii=False) + "\n")

def qiushiSpider(url, beginPage, endPage):
    """
    Scheduler: combines the base URL with each page number and crawls it.
    url       : the fixed prefix of the page URL
    beginPage : first page
    endPage   : last page
    """
    for page in range(beginPage, endPage + 1):
        fullurl = url + str(page)
        loadPage(fullurl)

if __name__ == "__main__":
    beginPage = int(input("Start page: "))
    endPage = int(input("End page: "))
    url = 'http://www.qiushibaike.com/8hr/page/'
    qiushiSpider(url, beginPage, endPage)
```
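The `from urllib import parse` import above is never used. One plausible use for it is absolutizing the image links: the `src` values on the page were protocol-relative (of the form `//pic.qiushibaike.com/...`), which is an assumption about the page here, not something shown in the listing. A minimal sketch:

```python
from urllib import parse

page_url = "http://www.qiushibaike.com/8hr/page/1"
# Hypothetical protocol-relative src value, as it might be extracted from the page.
src = "//pic.qiushibaike.com/system/pictures/12118/example.jpg"

# urljoin resolves a //host/path reference against the page's scheme.
print(parse.urljoin(page_url, src))
# -> http://pic.qiushibaike.com/system/pictures/12118/example.jpg
```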
Improved version: the same crawler restructured as a class, with the main flow split into url-list / request / extract / save steps:
```python
# coding=utf-8
import requests
from lxml import etree
import time

class QiuBai:
    def __init__(self):
        self.temp_url = "http://www.qiushibaike.com/8hr/page/{}"
        self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"}

    def get_url_list(self):
        '''Build the list of page URLs'''
        return [self.temp_url.format(i) for i in range(1, 14)]

    def parse_url(self, url):
        '''Send the request and return the decoded response body'''
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def get_content_list(self, html_str):
        '''Extract the data'''
        html = etree.HTML(html_str)
        div_list = html.xpath("//div[@id='content-left']/div")
        content_list = []
        for div in div_list:
            item = {}
            item["user_name"] = div.xpath(".//h2/text()")[0].strip()
            item["content"] = [i.strip() for i in div.xpath(".//div[@class='content']/span/text()")]
            content_list.append(item)
        return content_list

    def save_content_list(self, content_list):
        '''Save the items (here they are just printed)'''
        for content in content_list:
            print(content)

    def run(self):
        '''Main logic'''
        # 1. build the URL list
        url_list = self.get_url_list()
        # 2. request each URL and get the response
        for url in url_list:
            html_str = self.parse_url(url)
            # 3. extract the data
            content_list = self.get_content_list(html_str)
            # 4. save it
            self.save_content_list(content_list)


if __name__ == '__main__':
    t1 = time.time()
    qiubai = QiuBai()
    qiubai.run()
    print("total cost:", time.time() - t1)
```
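The improved version only prints each item in `save_content_list`. To persist the results like the earlier scripts, the method can be overridden to append JSON Lines. The sketch below assumes the `QiuBai` class above is importable (the module name `qiubai_spider` is hypothetical) and reuses the `qiushi.json` file name from the earlier scripts.

```python
import json

from qiubai_spider import QiuBai  # hypothetical module holding the class above

class QiuBaiJson(QiuBai):
    def save_content_list(self, content_list):
        '''Append each item as one JSON object per line (JSON Lines).'''
        with open("qiushi.json", "a", encoding="utf-8") as f:
            for content in content_list:
                f.write(json.dumps(content, ensure_ascii=False) + "\n")

if __name__ == '__main__':
    QiuBaiJson().run()
```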