Updating the data
import json
import pymongo
import scrapy
from lxml import etree
from spider.items import AnswerItem
from spider.settings import MONGODB_PORT
from spider.settings import MONGODB_HOST
from spider.settings import MONGODB_DBNAME
from spider.settings import MONGODB_Q_SHEET_NAME
from spider.tools.time_transfer import timeTransfer
from spider.tools.time_transfer import getCurrentTime


class UpdateSpider(scrapy.Spider):
    name = 'update_answer'
    allowed_domains = ['www.zhihu.com']
    base_url = "https://www.zhihu.com/api/v4/questions/{}/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cattachment%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%2Cpaid_info_content%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B*%5D.topics%3Bsettings.table_of_content.enabled%3B&offset=20&limit=20&sort_by=updated"
    start_urls = []

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.host = MONGODB_HOST
        self.port = MONGODB_PORT
        self.database = MONGODB_DBNAME
        self.sheet = MONGODB_Q_SHEET_NAME
        self.current_time = int(getCurrentTime())
        client = pymongo.MongoClient(host=self.host, port=self.port)
        db = client[self.database]
        collection = db[self.sheet]
        # Build one answer-list request per question already stored in MongoDB.
        for item in collection.find():
            print(item['id'])
            print(item['title'])
            self.start_urls.append(self.base_url.format(item['id']))

    def parse(self, response):
        data = json.loads(response.body)
        dataList = data['data']
        paging = data['paging']
        # Appending to start_urls after the crawl has started has no effect,
        # so schedule the next page as an explicit Request instead.
        if not paging['is_end']:
            yield scrapy.Request(paging['next'], callback=self.parse)
        for d in dataList:
            # Only keep answers created after the timestamp recorded at startup.
            if int(d['created_time']) > self.current_time:
                # The scraped content carries HTML tags; strip them with lxml's etree.
                html = etree.HTML(text=d['content'])
                # Create a fresh item per answer; reusing a single instance would
                # make every yielded item reference the same object.
                item = AnswerItem()
                item['content'] = html.xpath('string(.)')
                item['title'] = d['question']['title']
                item['created_time'] = timeTransfer(d['created_time'])
                yield item
                print(d['question']['title'])
                print("-------------------------")
                print(d['excerpt'].replace("[图片]", "").replace("[视频]", ""))
                print("Published at " + timeTransfer(d['created_time']))
            else:
                # Results are sorted, so everything after the first old answer is also old.
                return
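
The spider imports timeTransfer and getCurrentTime from spider.tools.time_transfer, a module this post does not show. Below is a minimal sketch of what it would need to provide, assuming Zhihu's created_time is a Unix timestamp in seconds; the real module may well read the last crawl time from storage instead.

# spider/tools/time_transfer.py -- a sketch; the original module is not shown.
import time

def timeTransfer(timestamp):
    # Render a Unix timestamp (seconds) as a human-readable local time string.
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(timestamp)))

def getCurrentTime():
    # Current Unix timestamp in seconds, comparable with created_time above.
    return int(time.time())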
Zhihu questions
from scrapy.http import Request
from spider.items import QuestionItem
import scrapy
import json


class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ["zhihu.com"]
    start_urls = [
        "http://www.zhihu.com/api/v4/topics/21238418/feeds/essence?include=data%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.is_normal%2Ccomment_count%2Cvoteup_count%2Ccontent%2Crelevant_info%2Cexcerpt.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Darticle%29%5D.target.content%2Cvoteup_count%2Ccomment_count%2Cvoting%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Dpeople%29%5D.target.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Canswer_type%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.paid_info%3Bdata%5B%3F%28target.type%3Darticle%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dquestion%29%5D.target.annotation_detail%2Ccomment_count%3B&limit=10&offset=0"]

    def parse(self, response):
        if not response.body:
            return
        data = json.loads(response.body)
        dataList = data['data']
        paging = data['paging']
        for entry in dataList:
            target = entry['target']
            # Distinguish column articles from questions: only feed entries
            # that carry a question are kept.
            if 'question' in target:
                # Save the question id and title.
                question_id = target['question']['id']
                title = target['question']['title']
                print(question_id)
                print(title)
                item = QuestionItem()
                item['id'] = question_id
                item['title'] = title
                yield item
        # Check whether there is another AJAX page of the topic feed and follow it.
        if not (paging['is_end'] or len(dataList) == 0):
            yield Request(paging['next'], callback=self.parse)
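
Both spiders yield item classes from spider.items, which is not shown in this post. Judging from the fields the spiders actually assign, a minimal sketch of the two definitions would be:

# spider/items.py -- a sketch reconstructed from the fields the spiders assign;
# the original definitions are not shown here.
import scrapy

class QuestionItem(scrapy.Item):
    id = scrapy.Field()
    title = scrapy.Field()

class AnswerItem(scrapy.Item):
    title = scrapy.Field()
    content = scrapy.Field()
    created_time = scrapy.Field()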
Answers
import json
import scrapy
import pymongo
from spider.settings import MONGODB_PORT
from spider.settings import MONGODB_HOST
from spider.settings import MONGODB_DBNAME
from spider.settings import MONGODB_Q_SHEET_NAME
from spider.tools.time_transfer import timeTransfer
from spider.items import AnswerItem
from lxml import etree


class AnswerSpider(scrapy.Spider):
    name = 'answer'
    allowed_domains = ['www.zhihu.com']
    start_urls = []
    base_url = "https://www.zhihu.com/api/v4/questions/{}/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cattachment%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%2Cpaid_info_content%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics%3Bsettings.table_of_content.enabled%3B&limit=5&offset=0&platform=desktop&sort_by=default"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.host = MONGODB_HOST
        self.port = MONGODB_PORT
        self.database = MONGODB_DBNAME
        self.sheet = MONGODB_Q_SHEET_NAME
        client = pymongo.MongoClient(host=self.host, port=self.port)
        db = client[self.database]
        collection = db[self.sheet]
        # One answer-list request per question collected by the zhihu spider.
        for item in collection.find():
            print(item['id'])
            print(item['title'])
            self.start_urls.append(self.base_url.format(item['id']))

    def parse(self, response):
        data = json.loads(response.body)
        dataList = data['data']
        paging = data['paging']
        # Appending to start_urls during the crawl has no effect,
        # so schedule the next page as an explicit Request instead.
        if not paging['is_end']:
            yield scrapy.Request(paging['next'], callback=self.parse)
        for d in dataList:
            # The scraped content carries HTML tags; strip them with lxml's etree.
            html = etree.HTML(text=d['content'])
            content = html.xpath('string(.)')
            # Wrap each answer in a fresh AnswerItem and hand it to the pipelines.
            item = AnswerItem()
            item['content'] = content
            item['title'] = d['question']['title']
            item['created_time'] = timeTransfer(d['created_time'])
            yield item
            print(d['question']['title'])
            print("-------------------------")
            print(content)
            print(timeTransfer(d['created_time']))
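
The yielded items are handed to Scrapy's item pipelines for storage, but the pipeline itself is not shown. Below is a minimal sketch that routes each item type into the MongoDB collections named by the settings constants the spiders import (MONGODB_Q_SHEET_NAME and MONGODB_A_SHEET_NAME); it is an assumption about the missing pipelines.py, not the original code.

# spider/pipelines.py -- a sketch, not the original pipeline.
import pymongo
from spider.items import QuestionItem, AnswerItem
from spider.settings import (MONGODB_HOST, MONGODB_PORT, MONGODB_DBNAME,
                             MONGODB_Q_SHEET_NAME, MONGODB_A_SHEET_NAME)

class MongoPipeline:
    def open_spider(self, spider):
        self.client = pymongo.MongoClient(host=MONGODB_HOST, port=MONGODB_PORT)
        db = self.client[MONGODB_DBNAME]
        self.questions = db[MONGODB_Q_SHEET_NAME]
        self.answers = db[MONGODB_A_SHEET_NAME]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        if isinstance(item, QuestionItem):
            # Upsert on id so re-running the zhihu spider does not duplicate questions.
            self.questions.update_one({'id': item['id']}, {'$set': dict(item)}, upsert=True)
        elif isinstance(item, AnswerItem):
            self.answers.insert_one(dict(item))
        return item

To take effect, such a pipeline would still need to be registered under ITEM_PIPELINES in settings.py.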