pip install pyquery
PyQuery对象的初始化方式有三种:通过字符串(text)、通过网址(url)、通过文件名(filename)。
initialize.html文件代码如下:
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>通过文件进行初始化PyQuery对象</title>
</head>
<body>
<ul id="container">
<li class="item">1</li>
<li class="item">2</li>
<li class="item">3</li>
</ul>
</body>
</html>
初始化代码如下:
"""Demonstrate the three ways to initialise a PyQuery object.

1. From a string (text).
2. From a web address (url).
3. From a file name (filename).
"""
from pyquery import PyQuery as pq

html = '''
<html lang="en">
<head>
<title>PyQuery学习</title>
</head>
<body>
<ul id="container">
<li class="object-1"/>
<li class="object-2"/>
<li class="object-3"/>
</ul>
</body>
</html>
'''

# Initialise from a string (text) and read the <title> text.
doc = pq(html)
title = doc('title')
print(title.text())
# Output:
# PyQuery学习

# Initialise from a web address (url); the page is decoded as UTF-8.
doc = pq(url='http://www.baidu.com', encoding='utf-8')
title = doc('title')
print(title.text())
# Output:
# 百度一下,你就知道
# Initialise from a file: read the document as UTF-8 text and hand the
# resulting string to PyQuery.  The whole file is loaded into memory at
# once, so this approach is not suitable for very large files.
# NOTE: PyQuery can also open the file itself via
# ``pq(filename='initialize.html')``.
with open('initialize.html', 'r', encoding='utf-8') as f:
    res = f.read()
doc = pq(res)
print(doc('title').text())
# Output:
# 通过文件进行初始化PyQuery对象
注意点:如果在读取文件时报错如下:UnicodeDecodeError: 'gbk' codec can't decode byte 0xa1 in position 98: illegal multibyte sequence
这通常是因为 open() 没有指定编码、使用了系统默认的 GBK 编码所致,在 open() 中显式传入 encoding='utf-8' 即可;也可以查看如下文章进行解决:GBK解码报错解决方案
from pyquery import PyQuery as pq

html = '''
<div class='wrap'>
<div id="container">
<ul class="list">
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html">third item</a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
doc = pq(html)

# items() turns the selection into a generator (a kind of iterator) that
# can be walked with a for loop, one matched node per iteration.
data = doc('li').items()  # a generator object
print(type(data))
for i in data:
    # The loop body must be indented; each iteration prints one <li> node.
    print(i)
# Output:
# <class 'generator'>
# <li class="item-0"><a href="link1.html">first item</a></li>
# <li class="item-1"><a href="link2.html">second item</a></li>
# <li class="item-0 active"><a href="link3.html">third item</a></li>
# <li class="item-1 active"><a href="link4.html">fourth item</a></li>
# <li class="item-0"><a href="link5.html">fifth item</a></li>
# Select the node with class "list" inside the node whose id is "container".
ul_item = doc('#container .list')
print(type(ul_item), ul_item, sep='\n')
# Output (the selected <ul> node itself is printed, children included):
# <class 'pyquery.pyquery.PyQuery'>
# <ul class="list">
# <li class="item-0"><a href="link1.html">first item</a></li>
# <li class="item-1"><a href="link2.html">second item</a></li>
# <li class="item-0 active"><a href="link3.html">third item</a></li>
# <li class="item-1 active"><a href="link4.html">fourth item</a></li>
# <li class="item-0"><a href="link5.html">fifth item</a></li>
# </ul>
# find() searches all descendant nodes; here it collects every <li> inside
# the list node.
lis = ul_item.find('li')
print(type(lis), lis, sep='\n')
# Output:
# <class 'pyquery.pyquery.PyQuery'>
# <li class="item-0"><a href="link1.html">first item</a></li>
# <li class="item-1"><a href="link2.html">second item</a></li>
# <li class="item-0 active"><a href="link3.html">third item</a></li>
# <li class="item-1 active"><a href="link4.html">fourth item</a></li>
# <li class="item-0"><a href="link5.html">fifth item</a></li>

# children() looks up the child nodes of the selection.
lis = ul_item.children()
print(type(lis), lis, sep='\n')
# Output:
# <class 'pyquery.pyquery.PyQuery'>
# <li class="item-0"><a href="link1.html">first item</a></li>
# <li class="item-1"><a href="link2.html">second item</a></li>
# <li class="item-0 active"><a href="link3.html">third item</a></li>
# <li class="item-1 active"><a href="link4.html">fourth item</a></li>
# <li class="item-0"><a href="link5.html">fifth item</a></li>

# Passing a selector to children() keeps only the matching child nodes,
# here those whose class includes "active".
lis = ul_item.children('.active')
print(type(lis), lis, sep='\n')
# Output:
# <class 'pyquery.pyquery.PyQuery'>
# <li class="item-0 active"><a href="link3.html">third item</a></li>
# <li class="item-1 active"><a href="link4.html">fourth item</a></li>
# parent() returns the parent node of the selection.
container = ul_item.parent()
print(type(container), container, sep='\n')
# Output:
# <class 'pyquery.pyquery.PyQuery'>
# <div id="container">
# <ul class="list">
# <li class="item-0"><a href="link1.html">first item</a></li>
# <li class="item-1"><a href="link2.html">second item</a></li>
# <li class="item-0 active"><a href="link3.html">third item</a></li>
# <li class="item-1 active"><a href="link4.html">fourth item</a></li>
# <li class="item-0"><a href="link5.html">fifth item</a></li>
# </ul>
# </div>
# parents() returns every ancestor node of the selection.
parents = ul_item.parents()
for parent in parents.items():
    # Both prints belong to the loop body: the type line appears once per
    # ancestor in the output below.
    print(type(parent))
    print(parent)
# Output:
# <class 'pyquery.pyquery.PyQuery'>
# <div class="wrap">
# <div id="container">
# <ul class="list">
# <li class="item-0"><a href="link1.html">first item</a></li>
# <li class="item-1"><a href="link2.html">second item</a></li>
# <li class="item-0 active"><a href="link3.html">third item</a></li>
# <li class="item-1 active"><a href="link4.html">fourth item</a></li>
# <li class="item-0"><a href="link5.html">fifth item</a></li>
# </ul>
# </div>
# </div>
# <class 'pyquery.pyquery.PyQuery'>
# <div id="container">
# <ul class="list">
# <li class="item-0"><a href="link1.html">first item</a></li>
# <li class="item-1"><a href="link2.html">second item</a></li>
# <li class="item-0 active"><a href="link3.html">third item</a></li>
# <li class="item-1 active"><a href="link4.html">fourth item</a></li>
# <li class="item-0"><a href="link5.html">fifth item</a></li>
# </ul>
# </div>
# Passing a selector to parents() keeps only the matching ancestors, here
# the one with class "wrap".
parents = ul_item.parents('.wrap')
print(type(parents), parents, sep='\n')
# Output:
# <class 'pyquery.pyquery.PyQuery'>
# <div class="wrap">
# <div id="container">
# <ul class="list">
# <li class="item-0"><a href="link1.html">first item</a></li>
# <li class="item-1"><a href="link2.html">second item</a></li>
# <li class="item-0 active"><a href="link3.html">third item</a></li>
# <li class="item-1 active"><a href="link4.html">fourth item</a></li>
# <li class="item-0"><a href="link5.html">fifth item</a></li>
# </ul>
# </div>
# </div>

# siblings() returns the sibling nodes of the selection; an optional
# selector narrows the result.
li = doc('.list .item-0.active')
all_siblings = li.siblings()
print(all_siblings)
active_siblings = li.siblings('.active')
print(active_siblings)
# Output (first four lines: all siblings; last line: the "active" sibling):
# <li class="item-1"><a href="link2.html">second item</a></li>
# <li class="item-0"><a href="link1.html">first item</a></li>
# <li class="item-1 active"><a href="link4.html">fourth item</a></li>
# <li class="item-0"><a href="link5.html">fifth item</a></li>
# <li class="item-1 active"><a href="link4.html">fourth item</a></li>
方法 | 说明 |
---|---|
:first-child | 获取第一个节点 |
:last-child | 获取最后一个节点 |
:nth-child(N) | 获取第N个节点,N=1,2,… |
:nth-child(2n) | 获取偶数位置的全部节点 |
:nth-child(2n-1) | 获取奇数位置的全部节点 |
:gt(N) | 获取索引大于N的节点,N=0,1,… |
:contains("first item") | 获取文本包含"first item"的节点 |
from pyquery import PyQuery as pq

html = '''
<div class='wrap'>
<div id="container">
<ul class="list">
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html">third item</a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
doc = pq(html)

# :first-child selects the first node.
li = doc('li:first-child')
print(type(li), li, sep='\n')
# Output:
# <class 'pyquery.pyquery.PyQuery'>
# <li class="item-0"><a href="link1.html">first item</a></li>

# :last-child selects the last node.
li = doc('li:last-child')
print(type(li), li, sep='\n')
# Output:
# <class 'pyquery.pyquery.PyQuery'>
# <li class="item-0"><a href="link5.html">fifth item</a></li>
# :nth-child(N) selects the N-th node, N = 1, 2, ...
li = doc('li:nth-child(1)')  # positions are counted from 1
print(type(li), li, sep='\n')
# Output:
# <class 'pyquery.pyquery.PyQuery'>
# <li class="item-0"><a href="link1.html">first item</a></li>

# :nth-child(2n) selects every node at an even position.
li = doc('li:nth-child(2n)')  # positions are counted from 1
print(type(li), li, sep='\n')
# Output:
# <class 'pyquery.pyquery.PyQuery'>
# <li class="item-1"><a href="link2.html">second item</a></li>
# <li class="item-1 active"><a href="link4.html">fourth item</a></li>

# :nth-child(2n-1) selects every node at an odd position.
li = doc('li:nth-child(2n-1)')  # positions are counted from 1
print(type(li), li, sep='\n')
# Output:
# <class 'pyquery.pyquery.PyQuery'>
# <li class="item-0"><a href="link1.html">first item</a></li>
# <li class="item-0 active"><a href="link3.html">third item</a></li>
# <li class="item-0"><a href="link5.html">fifth item</a></li>

# :gt(N) selects the nodes whose index is greater than N, N = 0, 1, ...
li = doc('li:gt(0)')  # indexes are counted from 0, so > 0 skips the first
print(type(li), li, sep='\n')
# Output:
# <class 'pyquery.pyquery.PyQuery'>
# <li class="item-1"><a href="link2.html">second item</a></li>
# <li class="item-0 active"><a href="link3.html">third item</a></li>
# <li class="item-1 active"><a href="link4.html">fourth item</a></li>
# <li class="item-0"><a href="link5.html">fifth item</a></li>

# :contains("...") selects the nodes whose text contains "first item".
li = doc('li > a:contains("first item")')
print(type(li), li, sep='\n')
# Output:
# <class 'pyquery.pyquery.PyQuery'>
# <a href="link1.html">first item</a>
from pyquery import PyQuery as pq

html = '''
<div class='wrap'>
<div id="container">
<ul class="list">
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html">third item</a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
doc = pq(html)

# attr() reads the value of an attribute.
class_value = doc('li:first-child').attr('class')
print(class_value)
# Output:
# item-0

# text() reads the text content.
text_value = doc('li:first-child').text()
print(text_value)
# Output:
# first item

# html() reads the HTML markup inside the node.
inner_html = doc('li:first-child').html()
print(inner_html)
# Output:
# <a href="link1.html">first item</a>
PyQuery中常见的节点操作方法(remove()、remove_class()、remove_attr()、add_class())用法如下:
from pyquery import PyQuery as pq

html = '''
<div class='wrap'>
<div id="container">
<ul class="list">
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html">third item</a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
doc = pq(html)

# remove() deletes the selected node from the document; here the first <li>.
removed = doc('li:first-child').remove()
print(removed)
print(doc)
# Output of print(doc) (the line printed for the call's return value is
# omitted):
# <div class="wrap">
# <div id="container">
# <ul class="list">
#
# <li class="item-1"><a href="link2.html">second item</a></li>
# <li class="item-0 active"><a href="link3.html">third item</a></li>
# <li class="item-1 active"><a href="link4.html">fourth item</a></li>
# <li class="item-0"><a href="link5.html">fifth item</a></li>
# </ul>
# </div>
# </div>
# remove_class() drops a class from the selected node.  The first <li> was
# removed earlier, so li:nth-child(3) is now the "fourth item" node.
third_li = doc('li:nth-child(3)')
print(third_li.remove_class('active'))
print(doc)
# Output of print(doc) (the line printed for the call's return value is
# omitted):
# <div class="wrap">
# <div id="container">
# <ul class="list">
#
# <li class="item-1"><a href="link2.html">second item</a></li>
# <li class="item-0 active"><a href="link3.html">third item</a></li>
# <li class="item-1"><a href="link4.html">fourth item</a></li>
# <li class="item-0"><a href="link5.html">fifth item</a></li>
# </ul>
# </div>
# </div>

# remove_attr() deletes an attribute from the selected node; here the
# class attribute of the last <li>.
last_li = doc('li:last-child')
print(last_li.remove_attr('class'))
print(doc)
# Output of print(doc) (the line printed for the call's return value is
# omitted):
# <div class="wrap">
# <div id="container">
# <ul class="list">
#
# <li class="item-1"><a href="link2.html">second item</a></li>
# <li class="item-0 active"><a href="link3.html">third item</a></li>
# <li class="item-1"><a href="link4.html">fourth item</a></li>
# <li><a href="link5.html">fifth item</a></li>
# </ul>
# </div>
# </div>

# add_class() adds a class to the selected node; here "item5" on the
# last <li>.
last_li = doc('li:last-child')
print(last_li.add_class('item5'))
print(doc)
# Output of print(doc) (the line printed for the call's return value is
# omitted):
# <div class="wrap">
# <div id="container">
# <ul class="list">
#
# <li class="item-1"><a href="link2.html">second item</a></li>
# <li class="item-0 active"><a href="link3.html">third item</a></li>
# <li class="item-1"><a href="link4.html">fourth item</a></li>
# <li class="item5"><a href="link5.html">fifth item</a></li>
# </ul>
# </div>
# </div>