Idea: scrape DOIs from Baidu Xueshu, then query Sci-Hub for the paper download links.
A time range has been added to the literature download.
It seems Sci-Hub stops responding if you hit it too many times in one go, so keep each run to about 10-20 papers.
Sci-Hub is missing many of the newest papers, so instead of sorting by sc_time to grab the latest results, use a year range.
Just set the total paper count to a multiple of 10 (pages is computed as that count divided by 10).
For any optional parameter you do not want to fill in, just press Enter.
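Since Sci-Hub seems to block bursts of requests (hence the 10-20 paper limit above), one easy mitigation is to pause between downloads. Below is a minimal sketch, not part of the script that follows; the helper name and the 3-8 second delay range are my own choices, and it could be dropped in where the script calls requests.post(base_url, ...).

import random
import time

import requests

def polite_post(url, data, headers=None, min_delay=3.0, max_delay=8.0):
    """Send one request to Sci-Hub, then sleep for a few seconds so
    consecutive hits are spaced out (the delay range is arbitrary)."""
    response = requests.post(url, data=data, headers=headers)
    time.sleep(random.uniform(min_delay, max_delay))
    return response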
# -*- coding: utf-8 -*-
# Import the required modules
import requests
import re
import os
import urllib.request
from bs4 import BeautifulSoup
'''author
ander2014@163.com
'''
os.chdir(os.path.dirname(os.path.realpath(__file__)))  # set the directory where files are saved, e.g. os.chdir('D:/')
# Build the Baidu Xueshu search URLs
def get_url(key, pages, start_p=0, start_year='-', end_year='+'):  # keyword, number of pages (10 papers per page), starting page, year range
urls = []
    for i in range(start_p, start_p + pages):
urls.append(
'https://xueshu.baidu.com/s?wd={wd}&pn={pn}&tn=SE_baiduxueshu_c1gjeupa&ie=utf-8&sort=sc_time&bcp=2&sc_f_para=sc_tasktype={{firstAdvancedSearch}}&sc_from=&sc_as_para=&filter=sc_year={{{start_year},{end_year}}}'.format(
wd=key, pn=i * 10, start_year=start_year, end_year=end_year))
return urls
# https://xueshu.baidu.com/s?wd=banana&pn=10&tn=SE_baiduxueshu_c1gjeupa&ie=utf-8&sort=sc_time
# wd=banana "dna" "<must-include phrase>" -(<word to exclude>) author:(<author>) affs:(<institution>) journal:(<journal>)
# https://xueshu.baidu.com/s?wd=banana "dna" "rna" -(gg) author:(hh) affs:(aa) journal:(bb)
# &pn=50&tn=SE_baiduxueshu_c1gjeupa&cl=3&bs=banana "dna" "rna" -(gg) author:(hh)
# &ie=utf-8&bcp=2&sc_f_para=sc_tasktype={firstAdvancedSearch}&sc_from=&sc_as_para=&filter=sc_year={2018,2020}
# get_url('banana',1,2,2017,2019)
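# Usage sketch (illustrative only, not part of the original flow): calling
# get_url('banana "dna"', pages=1, start_p=0, start_year=2017, end_year=2019)
# returns a one-element list whose URL ends with
# ...&sort=sc_time&bcp=2&sc_f_para=sc_tasktype={firstAdvancedSearch}&sc_from=&sc_as_para=&filter=sc_year={2017,2019}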
# Request headers
headers = {
'user-agent':
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
'Referer': 'https://googleads.g.doubleclick.net/'
}
# Collect the DOIs of the papers found on one Baidu Xueshu results page
def get_paper_link(headers, url):
    response = requests.get(url=url, headers=headers)
    res1_data = response.text
    # Find the links to the individual paper pages
    paper_links = re.findall(r'<h3 class=\"t c_font\">\n +\n +<a href=\"(.*)\"',
                             res1_data)
    doi_list = []  # collect the papers' DOIs in a list
    for link in paper_links:
        link_url = link + '/' if 'https://' in link or 'http://' in link else 'https://' + link + '/'
        print(link_url)
        response2 = requests.get(url=link_url, headers=headers)
        res2_data = response2.text
        # Extract the paper's DOI from its detail page
        try:
            paper_doi = re.findall(r'\'doi\'}\">\n +(.*?)\n ', res2_data)
            if '10' in paper_doi[0]:  # DOIs start with "10."
                doi_list.append(paper_doi)  # note: appends the whole findall list, hence doi[0] below
        except Exception:
            pass
    return doi_list
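# Usage sketch (illustrative only): because each hit is appended as the whole
# re.findall list, the return value looks like [['10.xxxx/xxxx'], ['10.yyyy/yyyy'], ...],
# which is why doi_download() below indexes each entry with doi[0]. For example:
# dois = get_paper_link(headers, get_url('banana', 1)[0])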
def getFile(save_path, url):
    # Download the file at `url` into `save_path`, keeping the original file name
    file_name = url.split('/')[-1]
    u = urllib.request.urlopen(url)
    block_sz = 8192
    with open(os.path.join(save_path, file_name), 'wb') as f:
        while True:
            buffer = u.read(block_sz)
            if not buffer:
                break
            f.write(buffer)
    print("Successfully downloaded " + file_name)
def doi_download(headers, key, pages, start_p=0, start_year='-', end_year='+'):
dir_path = os.getcwd()
pages_url = get_url(key, pages, start_p, start_year, end_year)
print(pages_url)
doi_list = []
for purl in pages_url:
doi_list += get_paper_link(headers, purl)
print(doi_list)
base_url = 'https://sci-hub.st/'
for doi in doi_list:
print('\n', doi, doi[0])
res = requests.post(base_url, data={'request': doi[0]})
        print('\nResponse:', res)
        print('URL visited:', res.url)
        try:
            soup = BeautifulSoup(res.text, features='lxml')
            pdf_URL = soup.button['onclick']
        except Exception:
            print("Failed to parse the Sci-Hub page!")
            continue
        # The onclick value normally looks like location.href='//.../xxx.pdf?download=true'.
        # Reverse it, match from "fdp" (".pdf" reversed) back to the nearest "//",
        # reverse the match again and drop one leading slash to recover the PDF link.
        pdf_URL = re.search(re.compile('fdp.*?//'), pdf_URL[::-1]).group()[::-1][1::]
        print(pdf_URL)
        if not re.search(re.compile('^https://'), pdf_URL):
            pdf_URL = 'https:/' + pdf_URL
        print('PDF URL:', pdf_URL)
        # Same reversal trick, stopping at the first "/", to pull out the file name
        name = re.search(re.compile('fdp.*?/'), pdf_URL[::-1]).group()[::-1][1::]
        print('PDF file name:', name)
        print('Saving to:', dir_path)
        try:
            print('\nDownloading...')
            getFile(dir_path, pdf_URL)
        except Exception:
            print("This article is empty (download failed)")
# Search and download
# key, start_p, pages, start_year='-', end_year='+'
# wd=banana "dna" "must include" -(exclude) author:(author) affs:(institution) journal:(journal)
print('Keyword format reference:\n', 'banana "dna" "must include this phrase" -(word to exclude) author:(author name) affs:(institution) journal:(journal name)\n\n')
key = input("Enter the keywords (in English) for the papers you want to download: ")
start_p = input("Enter the starting results page (number, default 0, optional): ")
num = input("Enter the total number of papers to download (number): ") or 10
start_year = input("Enter the starting year of the papers (number, optional): ")
end_year = input("Enter the ending year of the papers (number, optional): ")
if start_p == '':
    start_p = 0
else:
    start_p = int(start_p)
if start_year == '':
    start_year = '-'
if end_year == '':
    end_year = '+'
pages = max(1, int(num) // 10)  # each Baidu Xueshu results page holds 10 papers
doi_download(headers, key, pages, start_p, start_year=start_year, end_year=end_year)
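For reference, an equivalent non-interactive call (illustrative values only) would be:
doi_download(headers, 'banana "dna"', pages=1, start_p=0, start_year=2018, end_year=2020)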
References:
https://www.guanjihuan.com/archives/6846
https://blog.csdn.net/czh2637750821/article/details/115508922
https://blog.csdn.net/weixin_42430021/article/details/110738063
Code updated on 2021-10-15.
Also, remember to install the lxml package (pip install lxml), otherwise nothing will parse. Don't ask me how I found out (/ω\)
If you have any suggestions, feel free to let me know; the current download workflow still feels a bit clunky to me.
That's all I can write for now. Next time I'll add a way to download papers by title.