import re
from urllib.parse import quote

import bs4
import jieba
import requests
import wordcloud
from lxml.html import etree
from matplotlib import pyplot
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait


class Movie:
    """Search Douban for a film, print its details and draw a review word cloud.

    A headless Chrome session is started in ``__init__`` and released by
    ``get_search``; an instance is therefore good for one search.
    """

    def __init__(self, name):
        """Build the search URL for *name* and start a headless Chrome driver.

        Args:
            name: Film title typed by the user; it is percent-encoded before
                being placed in the query string.
        """
        self.url = f'https://search.douban.com/movie/subject_search?search_text={quote(name)}'
        # Chrome only honours the user agent when it is passed as the
        # ``--user-agent=`` switch; no shell is involved, so the value must
        # not be wrapped in quotes.
        self.headers = (
            '--user-agent=Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/78.0.3904.108 Safari/537.36'
        )
        self.chrome_options = webdriver.ChromeOptions()
        self.chrome_options.add_argument('--headless')
        self.chrome_options.add_argument('--disable-gpu')
        self.chrome_options.add_argument(self.headers)
        # NOTE(review): ``chrome_options=`` / ``executable_path=`` are the
        # Selenium 3 keyword names; newer Selenium releases expect ``options=``
        # and a ``Service`` object - confirm against the installed version.
        self.browser = webdriver.Chrome(chrome_options=self.chrome_options, executable_path='chromedriver.exe')
        self.wait = WebDriverWait(self.browser, 10)

    def get_search(self, limit=10):
        """Load the search page and list the first hits for the user to pick.

        Args:
            limit: Maximum number of hits to list (default 10).

        Returns:
            A list of ``[name, url]`` pairs, one per listed hit. The list is
            empty when no result link appears before the wait times out.
        """
        try:
            self.browser.get(self.url)
            links = self.wait.until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.title > a'))
            )
        except TimeoutException:
            # ``until`` raises instead of returning an empty value, so a
            # missing result list is reported here.
            print("没有搜到您要的信息,请重新输入")
            return []
        else:
            print('请选择:')
            movies = []
            # Slicing keeps this safe when fewer than ``limit`` hits exist.
            for i, link in enumerate(links[:limit]):
                name = link.text
                url = link.get_attribute('href')
                print(f'[{i}].{name}')
                movies.append([name, url])
            return movies
        finally:
            # Release the browser and its driver process on every path.
            self.browser.quit()

    def get_movie_info(self, movie):
        """Print the basic facts of a film and build its review word cloud.

        Args:
            movie: A ``[name, url]`` pair as returned by ``get_search``.

        Returns:
            Always ``None``. The method returns early when the detail page
            cannot be fetched or does not contain the expected info block.
        """
        name = movie[0]
        url = movie[1]
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36'}
        try:
            resp = requests.get(url, headers=headers, timeout=10)
        except requests.exceptions.RequestException:
            return None
        if resp.status_code != 200:
            return None

        soup = bs4.BeautifulSoup(resp.text, 'html.parser')
        info = soup.find(name='div', attrs={'id': 'info'})
        if info is None:
            # Not the page layout this scraper understands.
            return None
        print(info.text)

        # The rating box or its children may be absent; print what exists.
        rating = soup.find(name='div', attrs={'class': 'rating_self'})
        if rating is not None and rating.strong is not None:
            print(f'评分: {rating.strong.text}')
        if rating is not None and rating.a is not None:
            print(rating.a.text)

        text = self.get_reviews(url, headers)
        self.word_cloud(name, text)
        return None

    @staticmethod
    def get_reviews(url, headers, pages=5, page_size=20):
        """Collect the review excerpts from the first pages of a film's reviews.

        Args:
            url: Film detail URL; ``reviews?start=N`` is appended to it.
            headers: HTTP headers sent with every request.
            pages: Number of review pages to fetch (default 5).
            page_size: Offset step between pages. 20 is assumed to match the
                site's page length - TODO confirm.

        Returns:
            All excerpts concatenated into one string with whitespace removed;
            an empty string when nothing could be fetched.
        """
        base = url if url.endswith('/') else f'{url}/'
        chunks = []
        for page in range(pages):
            # Build each page URL from the untouched base so the suffix does
            # not accumulate from one iteration to the next.
            page_url = f'{base}reviews?start={page * page_size}'
            try:
                response = requests.get(page_url, headers=headers, timeout=10)
            except requests.exceptions.RequestException:
                break
            if response.status_code != 200 or not response.text.strip():
                break
            html = etree.HTML(response.text)
            if html is None:
                break
            fragments = html.xpath('//*[@class="short-content"]/text()')
            # Drop all whitespace, then the empty "()" left behind around the
            # expand link once its own text node is excluded.
            cleaned = ''.join(''.join(fragments).split()).replace('()', '')
            chunks.append(cleaned)
        return ''.join(chunks)

    @staticmethod
    def word_cloud(name, word):
        """Render a word cloud of *word*, save it as ``<name>.png`` and show it.

        Args:
            name: Film title; characters that are unsafe in file names are
                stripped before it is used as the file stem.
            word: Review text to segment and plot.
        """
        if not word:
            # WordCloud.generate() raises when it is given no words at all.
            print('没有获取到影评,无法生成词云')
            return
        name = re.sub(r'[\\/:*?"<>|\r\n。,.]+', '', name) or 'wordcloud'
        ls = jieba.lcut(word)
        text = ' '.join(ls)
        w = wordcloud.WordCloud(font_path='simkai.ttf', width=800, height=600, background_color='white')
        w.generate(text)
        w.to_file(f'{name}.png')
        pyplot.imshow(w)
        pyplot.axis('off')
        pyplot.show()


def main():
    """Prompt for a film title, let the user pick a hit and show its details."""
    movie_name = input("请输入电影名称,即可查询对应的影片信息:")
    m = Movie(movie_name)
    movies = m.get_search()
    if not movies:
        return
    num = input('请输入序号选择:')
    try:
        index = int(num)
    except ValueError:
        print('序号无效')
        return
    if not 0 <= index < len(movies):
        print('序号无效')
        return
    m.get_movie_info(movies[index])


if __name__ == '__main__':
    main()