import time
import requests
from playwright.sync_api import Playwright, sync_playwright, expect
from urllib.parse import unquote
import shutil
def get_md5(content):
    """Return the hexadecimal MD5 digest of the string ``content``.

    The text is encoded with ``str.encode()``'s default encoding before
    hashing.
    """
    from hashlib import md5

    digest = md5()
    digest.update(content.encode())
    return digest.hexdigest()
def save_file():
    """Confirm the native "Save As" dialog by sending Enter to its button.

    Looks up the dialog window (class ``#32770``, title ``另存为``), finds
    its first child of class ``Button`` and posts a key-down / key-up pair
    for the Return key to that button.

    Returns:
        True when both key messages were posted, False when the dialog or
        its button could not be found (nothing is posted in that case).
    """
    import win32con
    import win32gui

    hwnd = win32gui.FindWindow("#32770", "另存为")
    if not hwnd:
        # Without a dialog handle, FindWindowEx would search from the
        # desktop and could pick a button in an unrelated window.
        print('save_file: "另存为" dialog not found')
        return False

    hwnd_save = win32gui.FindWindowEx(hwnd, None, "Button", None)
    if not hwnd_save:
        print('save_file: no Button child found in the "另存为" dialog')
        return False

    win32gui.PostMessage(hwnd_save, win32con.WM_KEYDOWN, win32con.VK_RETURN, 0)
    win32gui.PostMessage(hwnd_save, win32con.WM_KEYUP, win32con.VK_RETURN, 0)
    return True
def judge_element_exist(page, js_path, type):
    """Return whether ``document.querySelector(js_path)`` matches an element.

    Args:
        page: Object exposing ``evaluate(expression, arg)``; in this file it
            is a Playwright page.
        js_path: CSS selector to look up in the current document.
        type: Previously chose which quote character wrapped the selector
            inside the generated JavaScript (1 = double quotes, anything
            else = single quotes). The selector is now passed to the page
            as an argument, so quoting no longer matters; the parameter is
            kept only so existing callers keep working.

    Returns:
        The value produced by the page: true when an element matches,
        false otherwise.
    """
    # Passing the selector as an argument avoids splicing it into JavaScript
    # source, which broke whenever the selector contained the same quote
    # character as the wrapper.
    result = page.evaluate(
        'selector => document.querySelector(selector) !== null', js_path)
    print('result:', result)
    return result
def get_pdf_down_url(url_tiao, timeout=30):
    """Resolve a medlive full-text redirect link into a PDF download URL.

    Requests ``url_tiao`` without following redirects, reads the
    ``Location`` response header, replaces ``doi`` with ``doi/pdf`` in it
    and appends ``?download=true``.

    Args:
        url_tiao: The redirect link, e.g.
            ``https://guide.medlive.cn/guideline/full_text_link_redirect.php?l=...&t=...``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The rewritten PDF URL as a string.

    Raises:
        KeyError: If the response carries no ``Location`` header.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Connection': 'keep-alive',
        'Host': 'guide.medlive.cn',
        'Referer': 'https://guide.medlive.cn/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.0.0 Safari/537.36',
    }
    # A timeout keeps an unresponsive server from blocking the script forever.
    res = requests.get(url_tiao, headers=headers, allow_redirects=False, timeout=timeout)
    print(res.status_code)

    location_url = res.headers.get('Location')
    if location_url is None:
        # Same exception type as the plain header lookup, but with context.
        raise KeyError(
            f'no Location header in response (status {res.status_code}) for {url_tiao}')
    print('location_url:', location_url)

    # NOTE(review): every occurrence of "doi" is replaced, not only the path
    # segment -- confirm this is intended for all target hosts.
    pdf_url = str(location_url).replace('doi', 'doi/pdf') + '?download=true'
    print('pdf_url-------', pdf_url)
    return pdf_url
def get_pdf_down_status(page, pdf_url, pdf_save_path):
    """Check the browser's downloads page for ``pdf_url`` and copy the file.

    Evaluates a script on ``page`` that reads every ``downloads-item`` entry
    of the ``downloads-manager`` element, then, for each entry whose link
    matches ``pdf_url`` (first 50 characters compared) and which shows
    neither a description text nor a tag, copies the downloaded file to
    ``pdf_save_path``.

    Args:
        page: Object exposing ``evaluate(script)``; the caller is expected
            to have navigated it to the downloads page beforehand.
        pdf_url: URL the download was started from.
        pdf_save_path: Destination path for the copied file.

    Returns:
        True if at least one matching, finished download was copied,
        otherwise False.
    """
    js = '''
(function(){
var temps = document.querySelector("body > downloads-manager").shadowRoot.querySelectorAll("downloads-item[id*='frb']");
var dics = [];
for(var temp=0;temp<temps.length;temp++){
var down_path = temps[temp].shadowRoot.querySelector("#file-icon").getAttribute('src');
var file_link = temps[temp].shadowRoot.querySelector("#file-link").getAttribute('href');
var cancel = temps[temp].shadowRoot.querySelector("#description").textContent;
var fail = temps[temp].shadowRoot.querySelector("#tag");
if(fail){fail=fail.textContent}else{fail=false}
dics.push({"down_path":down_path,"file_link":file_link, "cancel":cancel, "fail":fail})
}
return dics;
})()
'''
    results = page.evaluate(js) or []
    flug = False
    for result in results:
        raw_link = result['file_link']
        if not raw_link:
            # Entry without an href cannot be matched against pdf_url.
            continue
        file_link = raw_link.split('&')[0]
        if file_link[:50] != pdf_url[:50]:
            continue

        # Isolate the "path=" query value first and decode it afterwards, so
        # a percent-encoded "&" inside the file name does not cut the path.
        icon_src = result['down_path'] or ''
        down_path = unquote(icon_src.split('path=')[-1].split('&')[0])
        cancel = result['cancel']
        fail = result['fail']
        print('down_path:', down_path)
        print('file_link:', file_link)
        print('cancel:', result['cancel'])
        print('fail:', result['fail'])

        # A non-blank description means the download has not finished; a
        # truthy tag means it failed or the file no longer exists.
        if not str(cancel).replace("\n", '').replace(' ', '') and not fail:
            print('下载成功')
            try:
                # Copy the file from the download location to the target path.
                shutil.copy(down_path, pdf_save_path)
            except OSError as exc:
                # Report and treat as "not done" so the caller may retry.
                print('copy failed:', exc)
                continue
            flug = True
    return flug
def username_login(page):
    """Sign in through the medlive account/password form on ``page``.

    Opens the login URL (its ``service`` parameter points back to guideline
    3896), switches to the "电脑登录" view, clicks and fills the username
    and password inputs (both with empty strings here), presses the "登录"
    button and then sleeps three seconds.
    """
    login_url = "http://www.medlive.cn/auth/login?service=https%3A%2F%2Fguide.medlive.cn%2Fguideline%2F3896"
    page.goto(login_url)
    page.get_by_text("电脑登录").click()

    # Username first, then password; each input is focused before filling.
    for field_selector in ("#username", "#showPassword"):
        page.locator(field_selector).click()
        page.locator(field_selector).fill("")

    submit_button = page.get_by_role("button", name="登录")
    submit_button.click()
    time.sleep(3)
def run(playwright: Playwright) -> None:
    """Open a medlive guideline page, make sure we are logged in, download its PDF.

    Steps:
      1. Launch the locally installed Chrome (non-headless) and open the
         guideline URL.
      2. If the page shows a PDF button, inject the ``sess`` cookie and
         reload; fall back to ``username_login`` when the page still looks
         logged out.
      3. Download method 1: click the page's "下载" link, accept the
         agreement pop-up, wait for the download and save it as
         ``<md5 of url>.pdf`` in the current directory.

    Download method 2 (redirect link) lives in ``_download_via_redirect``
    and is not invoked, matching the original early ``return``.

    NOTE(review): the source this was restored from had lost its
    indentation; the nesting below follows the comments and the only
    arrangement that makes the flow sensible -- please confirm.

    Args:
        playwright: The Playwright instance provided by ``sync_playwright()``.
    """
    # Raw string: the original non-raw literal relied on invalid escape
    # sequences such as "\P" and "\G" being kept verbatim.
    executable_path = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
    browser = playwright.chromium.launch(executable_path=executable_path, headless=False)
    try:
        context = browser.new_context()
        page = context.new_page()
        # Other guideline pages tried earlier: /guideline/26790, /guideline/26559,
        # /guideline/27086, /guidelinesub/7885.
        url = 'https://guide.medlive.cn/guideline/3896'
        page.goto(url)

        # Log in by cookie injection; clicking download directly requires a login.
        if judge_element_exist(page, 'div.pdf_list div[class*="pdf_btn"]>a', 2):
            context.add_cookies([{'name': 'sess', 'value': '', 'domain': 'guide.medlive.cn', 'path': '/', 'expires': -1, 'httpOnly': False, 'secure': False, 'sameSite': 'Lax'}])
            time.sleep(1)
            page.goto(url)
            time.sleep(3)

        # Check whether the cookie is still valid.
        page.reload()
        icon_login = judge_element_exist(page, "li.icon.login>a", 1)
        icon_user = judge_element_exist(page, "li#get_icon_user_width>a", 1)
        if icon_login or not icon_user:
            # Fall back to account/password login.
            username_login(page)
        cookies = context.cookies()
        print("cookies", cookies)

        # Download method 1: click the download link (requires a login).
        if judge_element_exist(page, 'div.pdf_list div[class*="pdf_btn"]>a', 2):
            onclick = page.evaluate('(function(){return document.querySelector("#_article_viewer_1 > div > div.pdf_btn > a").getAttribute("onclick")})()')
            down_text = page.evaluate('(function(){return document.querySelector("#_article_viewer_1 > div > div.pdf_btn > a").textContent})()')
            print('onclick:', onclick)
            print('down_text:', down_text)
            # Either value may come back as None (missing attribute / text).
            if 'download(' in (onclick or '') or '下载' in (down_text or ''):
                page.get_by_role("link", name="下载").click()
            else:
                print(f'{url} 不存在PDF下载链接')
                return
            time.sleep(0.5)

            # Accept the agreement pop-up: tick the checkbox, then continue.
            page.evaluate('document.querySelector("div.tipMask-checkBtn").click()')
            page.evaluate('document.querySelector("div.tipMask-btm.clearfix > div.tipMask-btnNext").click()')

            # The second click opens a popup and starts the download.
            with page.expect_download() as download_info:
                with page.expect_popup():
                    page.get_by_role("link", name="下载").click()
            download = download_info.value
            page.wait_for_timeout(3000)

            # Save under a name derived from the page URL.
            pdf_oss_name = get_md5(url) + '.pdf'
            print('url地址-----------', url)
            print('pdf_oss_name-----------', pdf_oss_name)
            download.save_as(f'./{pdf_oss_name}')
        else:
            # Download method 2 (target site sits behind a "5-second shield").
            # NOTE(review): the original returned here before running any of
            # the method-2 steps; they are kept in _download_via_redirect().
            return
    finally:
        # Release the browser on every exit path, including early returns.
        browser.close()


def _download_via_redirect(page):
    """Download method 2: follow the full-text redirect link and save the PDF.

    Not called by ``run`` (see the note there). Reads ``data-l`` / ``data-t``
    from the full-text link element, builds the redirect URL, resolves the
    PDF URL, opens it, confirms the native save dialog, then polls the
    browser's downloads page up to five times and copies the finished file
    to ``./test.pdf``.
    """
    data_l = page.evaluate('(function(){return document.querySelector("div.one_info_L>span.icon-card1.full_text_link").getAttribute("data-l")})()')
    data_t = page.evaluate('(function(){return document.querySelector("div.one_info_L>span.icon-card1.full_text_link").getAttribute("data-t")})()')
    print('data_l:', data_l)
    print('data_t:', data_t)

    # The redirect URL needs both parameters, data_l and data_t.
    url_tiao = f'https://guide.medlive.cn/guideline/full_text_link_redirect.php?l={data_l}&t={data_t}'
    print('url_tiao---------', url_tiao)
    pdf_url = get_pdf_down_url(url_tiao)
    print('pdf_url:', pdf_url)
    # Manual pause kept from the original; remove for unattended runs.
    input('stop')

    try:
        page.goto(pdf_url, timeout=100000, wait_until="domcontentloaded")
    except Exception as exc:
        # Best effort, as before, but no longer silent and no longer
        # swallowing KeyboardInterrupt / SystemExit.
        print('goto pdf_url failed:', exc)

    # Confirm the native save dialog.
    save_file()
    time.sleep(5)

    # Open the browser's downloads page.
    try:
        page.goto('chrome://downloads', timeout=100000, wait_until="domcontentloaded")
    except Exception as exc:
        print('goto chrome://downloads failed:', exc)
    time.sleep(10)

    # Target path and file name; the downloaded file is copied here.
    file_name = 'test.pdf'
    pdf_save_path = f'./{file_name}'
    for _ in range(5):
        time.sleep(5)
        if get_pdf_down_status(page, pdf_url, pdf_save_path):
            break
        page.reload()
# Script entry point: executed whenever this module is run or imported.
# Starts the Playwright sync API and passes the instance to run().
with sync_playwright() as playwright:
    run(playwright)