当前位置: 首页 > 工具软件 > Budejie > 使用案例 >

Python3 爬虫：爬取百思不得姐信息（http://www.budejie.com/）

童华池
2023-12-01
 1 # -*- coding:utf-8 -*-
 2 # author:zxy
 3 # Date:2018-10-21
 4 
 5 import request
 6 from lxml import etree
 7 import threading
 8 from queue import Queue
 9 import csv
10 import requests
11 
class Produce(threading.Thread):
    """Producer thread: download listing pages and queue the jokes found.

    Each thread repeatedly takes a page URL from ``page_queue``, fetches it,
    extracts every joke (text plus link) and puts a ``(joke, link)`` tuple on
    ``joke_queue``. The thread finishes once ``page_queue`` has no URLs left.

    Args:
        page_queue: queue of page URLs still to be downloaded.
        joke_queue: queue receiving ``(joke, link)`` tuples.
        request_timeout: keyword-only; seconds to wait for the HTTP request
            before giving up on a page (default 10).
        *args, **kwargs: forwarded to ``threading.Thread``.
    """

    # Request headers sent with every page download.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'Cookie': '__cfduid=ddb28ef1934faef742f7fb8911d7b33bd1540080067; UM_distinctid=16693ece9945b2-0b031da4b19f32-333b5602-1fa400-16693ece9958e4;'
                  ' _ga=GA1.2.1950184368.1540080070; _gid=GA1.2.1249143498.1540080070; _gat=1'
    }

    def __init__(self, page_queue, joke_queue, *args, request_timeout=10, **kwargs):
        super().__init__(*args, **kwargs)
        self.base_domain = "http://www.budejie.com"
        self.page_queue = page_queue
        self.joke_queue = joke_queue
        self.request_timeout = request_timeout

    def run(self):
        """Process page URLs until the page queue is exhausted."""
        from queue import Empty

        while True:
            # A non-blocking get avoids the empty()/get() race: with several
            # producers, a thread could see a non-empty queue, lose the last
            # URL to a sibling, and then block forever in get().
            try:
                url = self.page_queue.get_nowait()
            except Empty:
                break
            self.parse_url(url)

    def parse_url(self, url):
        """Download one page and enqueue every joke it contains.

        A failed request is reported and the page is skipped, so one bad page
        does not terminate the thread.
        """
        page = url.split('/')[-1]
        try:
            response = requests.get(
                url, headers=self.headers, timeout=self.request_timeout
            )
        except requests.RequestException as exc:
            print("第%s页下载失败: %s" % (page, exc))
            return

        html = etree.HTML(response.text)
        for desc in html.xpath("//div[@class='j-r-list-c-desc']"):
            joke = "\n".join(desc.xpath(".//text()")).strip()
            hrefs = desc.xpath(".//a/@href")
            # Keep the joke even when the entry has no anchor; the original
            # indexed [0] unconditionally and raised IndexError.
            link = self.base_domain + hrefs[0] if hrefs else ""
            self.joke_queue.put((joke, link))
        print("=" * 30 + "第%s页下载完成!" % page + "=" * 30)
42 
43 
class Consumer(threading.Thread):
    """Consumer thread: write queued ``(joke, link)`` tuples as CSV rows.

    Args:
        joke_queue: queue supplying ``(joke, link)`` tuples.
        write: object with a ``writerow`` method (a ``csv.writer`` in this
            file) shared by all consumers.
        gLock: lock serialising access to ``write`` across consumer threads.
        idle_timeout: keyword-only; seconds to wait for a new item before
            concluding that production has finished (default 40).
        *args, **kwargs: forwarded to ``threading.Thread``.
    """

    # Not read by this class; retained so the class attribute stays available.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'Cookie': '__cfduid=ddb28ef1934faef742f7fb8911d7b33bd1540080067; UM_distinctid=16693ece9945b2-0b031da4b19f32-333b5602-1fa400-16693ece9958e4;'
                  ' _ga=GA1.2.1950184368.1540080070; _gid=GA1.2.1249143498.1540080070; _gat=1'
    }

    def __init__(self, joke_queue, write, gLock, *args, idle_timeout=40, **kwargs):
        super().__init__(*args, **kwargs)
        self.joke_queue = joke_queue
        self.write = write
        self.gLock = gLock
        self.idle_timeout = idle_timeout

    def run(self):
        """Write rows until no item arrives within ``idle_timeout`` seconds."""
        from queue import Empty

        while True:
            # Only an idle queue ends the loop; other errors are no longer
            # silently swallowed by a bare ``except``.
            try:
                joke, link = self.joke_queue.get(timeout=self.idle_timeout)
            except Empty:
                break
            # ``with`` releases the lock even if writerow raises, so a failed
            # write cannot leave the other consumers blocked on the lock.
            with self.gLock:
                self.write.writerow((joke, link))
67 
68 
def main():
    """Crawl pages 1-10 of budejie.com and append the jokes to a CSV file.

    Starts five ``Produce`` threads that download pages and three
    ``Consumer`` threads that write rows to ``baisibudejie.csv``, then waits
    for all of them so the file is flushed and closed before returning.
    """
    page_queue = Queue(100)
    joke_queue = Queue(1000)
    gLock = threading.Lock()

    for x in range(1, 11):
        page_queue.put("http://www.budejie.com/%d" % x)

    with open('baisibudejie.csv', 'a', newline='', encoding='utf-8') as fp:
        writer = csv.writer(fp)
        # The file is opened in append mode, so only emit the header when the
        # file is empty; otherwise every run would add another header row.
        if fp.seek(0, 2) == 0:
            writer.writerow(('content', 'link'))

        producers = [Produce(page_queue, joke_queue) for _ in range(5)]
        consumers = [Consumer(joke_queue, writer, gLock) for _ in range(3)]
        workers = producers + consumers

        for worker in workers:
            worker.start()
        # Join inside the ``with`` block: the file must stay open until the
        # last consumer has written its final row.
        for worker in workers:
            worker.join()
88 
89 
# Start the crawler only when this file is executed as a script.
if __name__ == "__main__":
    main()

 

转载于:https://www.cnblogs.com/z-712/p/9824940.html

 类似资料: