-
- import urllib.request
- import xml.dom.minidom
- import sqlite3
- import threading
- import time
class logger(object):
    """Minimal console logger shared by the rest of the script."""

    def log(self, *msg):
        """Print each positional argument on its own line.

        Nothing is printed when no arguments are given.
        """
        if msg:
            print(*msg, sep='\n')


# Module-level logger instance used by every other definition in this file.
Log = logger()
Log.log('测试下')
class downloader(object):
    """Fetch the raw body of a single URL."""

    def __init__(self, url):
        """Remember the URL to fetch.

        Args:
            url: URL handed unchanged to ``urllib.request.urlopen``.
        """
        self.url = url

    def download(self):
        """Download ``self.url`` and return the response body.

        Returns:
            The body as returned by ``read()`` (bytes), or ``None`` when the
            request fails for any reason.
        """
        Log.log('开始下载', self.url)
        try:
            # The context manager closes the response even if read() raises.
            with urllib.request.urlopen(self.url) as response:
                content = response.read()
        except Exception as exc:
            # Deliberately best-effort: any failure becomes None. Catching
            # Exception (not a bare except) lets Ctrl-C / SystemExit through.
            Log.log('下载出错', exc)
            return None
        Log.log('下载完毕')
        return content
-
-
class parser(object):
    """Extract the post text and linked post IDs from a downloaded page."""

    def __init__(self, content):
        """Parse ``content`` into a DOM tree.

        Args:
            content: Page markup; it is passed straight to
                ``xml.dom.minidom.parseString`` and therefore has to be
                well-formed XML.

        Raises:
            xml.parsers.expat.ExpatError: If the markup is not well-formed.
        """
        self.html = xml.dom.minidom.parseString(content)

    def parse(self):
        """Collect the post text and the IDs of linked posts.

        Returns:
            A dict with two keys:

            * ``'content'`` - the ``data`` of the first child node of the
              last ``<div>`` whose ``class`` attribute is exactly
              ``'content'``; ``''`` when no such div carries text.
            * ``'url'`` - one entry per ``<span>`` whose direct parent is an
              ``<a>`` element: that anchor's ``href`` with the first 10 and
              the last 4 characters removed.
        """
        Log.log('开始提取数据')
        contents = {'content': '', 'url': []}

        for div in self.html.getElementsByTagName('div'):
            # getAttribute() returns '' for a missing attribute, so this one
            # comparison also covers the "has a class attribute" check.
            if div.getAttribute('class') != 'content':
                continue
            # Skip empty divs and divs whose first child is not a text-like
            # node instead of raising IndexError / AttributeError.
            text = getattr(div.firstChild, 'data', None)
            if text is not None:
                contents['content'] = text

        for span in self.html.getElementsByTagName('span'):
            anchor = span.parentNode
            # The parent may be the Document node, which has no tagName.
            if getattr(anchor, 'tagName', None) != 'a':
                continue
            href = anchor.getAttribute('href')
            # Presumably hrefs look like '/articles/<id>.htm' (10-character
            # prefix, 4-character suffix) - TODO confirm against real pages.
            contents['url'].append(href[10:-4])

        Log.log('提取数据完毕')
        return contents
def downloadPage(qid, db):
    """Download the article page for ``qid`` and store the results in ``db``.

    The page text (when non-empty) is written with ``db.updateContent``,
    every ID extracted from the page is handed to ``db.addQID``, and the
    article's status is set to 2 when exactly two IDs were extracted.
    Nothing happens when the download yields no content.

    Args:
        qid: Article ID; its ``str()`` form is embedded in the page URL.
        db: Object providing ``updateContent``, ``addQID`` and
            ``updateStatus``.
    """
    page_url = 'http://www.qiushibaike.com/articles/%s.htm' % str(qid)
    raw = downloader(page_url).download()
    if not raw:
        return

    extracted = parser(raw).parse()
    text = extracted['content']
    linked_ids = extracted['url']

    if text:
        db.updateContent(qid, text)
    for linked_id in linked_ids:
        db.addQID(linked_id)
    if len(linked_ids) == 2:
        db.updateStatus(qid, 2)
class downloaderPool(object):
    """Run ``downloadPage`` for queued IDs on a bounded set of threads."""

    def __init__(self, maxLength=15, pollInterval=1):
        """Create an idle pool.

        Args:
            maxLength: Number of worker slots, i.e. the maximum number of
                downloads running at the same time.
            pollInterval: Seconds the supervisor sleeps between two
                scheduling passes.
        """
        # One entry per slot: None when free, otherwise the running Thread.
        self.downloaders = [None] * maxLength
        self.downloadList = []
        self.db = None
        self.pollInterval = pollInterval

    def setDownloadList(self, downloadList):
        """Append ``downloadList`` to the queue, dropping duplicate IDs.

        The first occurrence of every ID keeps its position, so IDs are
        handed out in the order they were queued.
        """
        merged = self.downloadList + list(downloadList)
        self.downloadList = list(dict.fromkeys(merged))

    def setdb(self, db):
        """Set the database object passed to every ``downloadPage`` call."""
        self.db = db

    def daemon(self):
        """Supervise the worker slots until all queued IDs are processed.

        Each pass frees the slots whose thread has finished and starts a new
        ``downloadPage`` thread in every free slot while IDs remain. The
        method returns once the queue is empty and no worker is running.
        """
        Log.log('设置守护进程')
        while True:
            # Reap finished workers so their slots can be reused.
            for index, worker in enumerate(self.downloaders):
                if worker is not None and not worker.is_alive():
                    Log.log('将下载器置空', index)
                    self.downloaders[index] = None

            # Fill free slots. The thread is not joined here: joining right
            # after start() would make the downloads run one after another.
            for index, worker in enumerate(self.downloaders):
                if worker is not None:
                    continue
                qid = self.getQID()
                if qid is None:
                    break
                t = threading.Thread(target=downloadPage, args=(qid, self.db))
                self.downloaders[index] = t
                t.start()
                Log.log('设置下载器', index)

            idle = all(worker is None for worker in self.downloaders)
            # A pool without slots can never drain its queue; stop instead
            # of spinning forever.
            if idle and (not self.downloadList or not self.downloaders):
                break
            time.sleep(self.pollInterval)

    def getQID(self):
        """Remove and return the next queued ID, or ``None`` if none is left."""
        if self.downloadList:
            return self.downloadList.pop(0)
        return None

    def beginDownload(self):
        """Run the supervisor in a daemon thread and block until it is done."""
        supervisor = threading.Thread(target=self.daemon)
        supervisor.daemon = True
        supervisor.start()
        supervisor.join()

    def getDownloader(self):
        """Return the index of the first free slot, or ``None`` if all are busy."""
        for index, worker in enumerate(self.downloaders):
            if worker is None:
                return index
        return None
# SQL statements for the ``qiushibaike`` table; all values are bound through
# ``?`` placeholders.
ADD_Q_ID = 'insert into qiushibaike(id,success) values(?,?)'
UPDATE_Q_CONTENT = 'update qiushibaike set content=? where id=?'
UPDATE_Q_STATUS = 'update qiushibaike set success=? where id=?'
Q_LIST = 'select id from qiushibaike where success=?'
Q_LIST_BY_ID = 'select count(*) from qiushibaike where id=?'


class dbConnect(object):
    """Thin SQLite access layer for the ``qiushibaike`` table.

    Every operation opens its own connection to ``dbpath`` and closes it
    again before returning.
    """

    def __init__(self, dbpath='db.sqlite'):
        """Remember the path of the SQLite database file."""
        self.dbpath = dbpath

    def _execute(self, sql, params, fetch=False):
        """Run one statement on a fresh connection and always close it.

        Args:
            sql: Statement with ``?`` placeholders.
            params: Tuple of values bound to the placeholders.
            fetch: When true, return all result rows and do not commit;
                otherwise commit and return ``None``.

        Raises:
            sqlite3.Error: Propagated from the statement or the commit; the
                cursor and connection are closed first.
        """
        cn = sqlite3.connect(self.dbpath)
        try:
            c = cn.cursor()
            try:
                c.execute(sql, params)
                if fetch:
                    return c.fetchall()
                cn.commit()
                return None
            finally:
                c.close()
        finally:
            cn.close()

    def addQID(self, qid):
        """Insert ``qid`` with ``success=1``; a failed insert is only logged."""
        Log.log('插入糗事百科', qid)
        try:
            self._execute(ADD_Q_ID, (qid, 1))
        except sqlite3.Error as exc:
            # Best effort, as before - presumably the usual cause is an ID
            # that already exists (verify against the table schema).
            Log.log('添加ID出错', qid, exc)
            return
        # Reported only when the insert really happened.
        Log.log('插入成功')

    def updateContent(self, qid, content):
        """Store ``content`` for the row whose id is ``qid``."""
        Log.log('更新糗事百科', qid, content)
        self._execute(UPDATE_Q_CONTENT, (content, qid))
        Log.log('更新成功')

    def updateStatus(self, qid, flag):
        """Set the ``success`` column of row ``qid`` to ``flag``."""
        Log.log('更新状态', qid, flag)
        self._execute(UPDATE_Q_STATUS, (flag, qid))
        Log.log('更新状态成功')

    def getList(self, unDonloaded=1):
        """Return the ids of all rows whose ``success`` equals ``unDonloaded``.

        Args:
            unDonloaded: Value matched against the ``success`` column; the
                default 1 is the value ``addQID`` inserts.

        Returns:
            A list of ids.
        """
        Log.log('获得列表')
        rows = self._execute(Q_LIST, (unDonloaded,), fetch=True)
        ids = [row[0] for row in rows]
        Log.log('获得列表成功')
        return ids
class singleDownloader(object):
    """Download queued IDs one after another on the calling thread."""

    def __init__(self):
        """Create a downloader with an empty queue and no database set."""
        self.downloadList = []
        # Initialised here so the attribute exists before setdb() is called.
        self.db = None

    def setdb(self, db):
        """Set the database object passed to every ``downloadPage`` call."""
        self.db = db

    def setDownloadList(self, downloadList):
        """Append ``downloadList`` to the queue, dropping duplicate IDs.

        The first occurrence of every ID keeps its position.
        """
        merged = self.downloadList + list(downloadList)
        self.downloadList = list(dict.fromkeys(merged))

    def beginDownload(self):
        """Call ``downloadPage`` for every queued ID, emptying the queue.

        IDs are removed as they are processed; otherwise each later call
        would download all previously queued IDs again.
        """
        while self.downloadList:
            downloadPage(self.downloadList.pop(0), self.db)
-
def main():
    """Keep downloading until the database lists no more pending IDs.

    Each round fetches the pending IDs with ``db.getList()``, queues them on
    a ``singleDownloader``, runs the download and then sleeps for one second
    before asking the database again.
    """
    db = dbConnect('db.sqlite')

    dp = singleDownloader()
    dp.setdb(db)

    while True:
        pending = db.getList()
        if not pending:
            break
        dp.setDownloadList(pending)
        dp.beginDownload()
        time.sleep(1)


if __name__ == '__main__':
    main()
代码没有问题,可以正常运行,但希望能做到以下两点:
1、多线程下载
2、代码分离度更高,更面向对象
各位看官有什么好想法,欢迎贴出来看看。