pyquery允许你对xml文档进行jquery查询,API尽可能类似jquery,pyquery使用lxml进行快速的xml和html操作
pyquery是python中强大而又灵活的网页解析库,如果你觉得正则写起来太麻烦,又觉得beautifulsoup语法太难记,而你熟悉jquery的语法,那么pyquery就是你的绝佳选择
# Sample HTML fragment used as the parsing input for this example.
html = '''
<div>
<ul>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00B002">找对象</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/001002">新鲜事</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/001004">同城互助</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/007005">同城活动</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00D001">虞城有爱</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq
# Initialise a PyQuery document directly from the HTML string.
doc = pq(html)
# Calling the document with a CSS selector returns the matching elements;
# printing the selection serialises every matched <li>.
print(doc('li'))
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00B002">找对象</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/001002">新鲜事</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/001004">同城互助</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/007005">同城活动</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00D001">虞城有爱</a></li>
from pyquery import PyQuery as pq
# Initialise a PyQuery document by fetching a remote page.
# The encoding is passed explicitly: without it the UTF-8 response may be
# decoded with the wrong charset, which garbles the Chinese text in <title>.
doc = pq(url='http://www.baidu.com', encoding='utf-8')
# Print only the <head> element of the fetched page.
print(doc('head'))
<head><meta http-equiv="content-type" content="text/html;charset=utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=Edge"/><meta content="always" name="referrer"/><link rel="stylesheet" type="text/css" href="http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css"/><title>百度一下，你就知道</title></head>
from pyquery import PyQuery as pq
# Initialise a PyQuery document from a local file; 'hello.html' is resolved
# relative to the current working directory.
doc = pq(filename='hello.html')
# Print the whole parsed document first ...
print(doc)
# ... then a 30-character separator line ...
print('---' * 10)
# ... then only the <li> elements.
print(doc('li'))
<div>
<ul>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00B002">dfgdd</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/001002">gdsfeew</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/001004">kuikuik</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/007005">qe23rw</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00D001">fgdfggb</a></li>
</ul>
</div>
------------------------------
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00B002">dfgdd</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/001002">gdsfeew</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/001004">kuikuik</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/007005">qe23rw</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00D001">fgdfggb</a></li>
# Same list as before, now with an id on the <div> and a class on the <ul>
# so that id/class selectors can be demonstrated.
html = '''
<div id='container'>
<ul class='list'>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00B002">找对象</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/001002">新鲜事</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/001004">同城互助</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/007005">同城活动</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00D001">虞城有爱</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
# Tag selector: every <li>.
print(doc('li'))
# Descendant selector: <li> inside .list inside #container (same five items here).
print(doc('#container .list li'))
# Id selector: the outer <div> together with everything it contains.
print(doc('#container'))
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00B002">找对象</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/001002">新鲜事</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/001004">同城互助</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/007005">同城活动</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00D001">虞城有爱</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00B002">找对象</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/001002">新鲜事</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/001004">同城互助</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/007005">同城活动</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00D001">虞城有爱</a></li>
<div id="container">
<ul class="list">
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00B002">找对象</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/001002">新鲜事</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/001004">同城互助</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/007005">同城活动</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00D001">虞城有爱</a></li>
</ul>
</div>
# Sample markup: the third <li> additionally carries the "active" class and
# wraps its link text in <span class="bold">.
html = '''
<div id='container'>
<ul class='list'>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00B002">找对象</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/001002">新鲜事</a></li>
<li class="Sq_leftNav_forum1 active"><a href="/shuo/forum/001004"><span class="bold">同城互助</span></a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/007005">同城活动</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00D001">虞城有爱</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
# Select the <li> elements through a descendant selector ...
items = doc('.list li')
# print(items)
# ... or equivalently with find(), which searches below the current selection.
lis = doc.find('li')
# print(lis)
# A selection can itself be called with a selector to search inside it;
# only the active item contains a <span>.
dfg = lis('span')
print(dfg)
<span class="bold">同城互助</span>
# Continues with the `doc` built in the previous snippet.
# Select the <ul class="list"> element itself.
items = doc('.list')
print(items)
# children() returns the child elements of the selection: the five <li> items.
lis = items.children()
print(lis)
<ul class="list">
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00B002">找对象</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/001002">新鲜事</a></li>
<li class="Sq_leftNav_forum1 active"><a href="/shuo/forum/001004"><span class="bold">同城互助</span></a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/007005">同城活动</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00D001">虞城有爱</a></li>
</ul>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00B002">找对象</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/001002">新鲜事</a></li>
<li class="Sq_leftNav_forum1 active"><a href="/shuo/forum/001004"><span class="bold">同城互助</span></a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/007005">同城活动</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00D001">虞城有爱</a></li>
# children() accepts a selector to filter the children; only the <li> with
# the "active" class is kept. Reuses `items` from the previous snippet.
lis = items.children('.active')
print(lis)
<li class="Sq_leftNav_forum1 active"><a href="/shuo/forum/001004"><span class="bold">同城互助</span></a></li>
# Sample markup used to demonstrate walking up to the parent element.
html = '''
<div id='container'>
<ul class='list'>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00B002">找对象</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/001002">新鲜事</a></li>
<li class="Sq_leftNav_forum1 active"><a href="/shuo/forum/001004"><span class="bold">同城互助</span></a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/007005">同城活动</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00D001">虞城有爱</a></li>
</ul>
</div>
'''
from pyquery import PyQuery
doc = PyQuery(html)
# Select the <ul class="list"> element.
items = doc('.list')
print(items)
# parent() moves one level up: here the enclosing <div id="container">.
container = items.parent()
print(container)
<ul class="list">
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00B002">找对象</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/001002">新鲜事</a></li>
<li class="Sq_leftNav_forum1 active"><a href="/shuo/forum/001004"><span class="bold">同城互助</span></a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/007005">同城活动</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00D001">虞城有爱</a></li>
</ul>
<div id="container">
<ul class="list">
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00B002">找对象</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/001002">新鲜事</a></li>
<li class="Sq_leftNav_forum1 active"><a href="/shuo/forum/001004"><span class="bold">同城互助</span></a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/007005">同城活动</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00D001">虞城有爱</a></li>
</ul>
</div>
# Sample markup used to demonstrate sibling lookup.
html = '''
<div id='container'>
<ul class='list'>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00B002">找对象</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/001002">新鲜事</a></li>
<li class="Sq_leftNav_forum1 active"><a href="/shuo/forum/001004"><span class="bold">同城互助</span></a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/007005">同城活动</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00D001">虞城有爱</a></li>
</ul>
</div>
'''
from pyquery import PyQuery
doc = PyQuery(html)
# Inside .list, pick the element carrying both classes (no space between them).
li = doc('.list .Sq_leftNav_forum1.active')
print(li)
print(li.siblings()) # get the sibling elements (the other four <li> items)
<li class="Sq_leftNav_forum1 active"><a href="/shuo/forum/001004"><span class="bold">同城互助</span></a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/001002">新鲜事</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00B002">找对象</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/007005">同城活动</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00D001">虞城有爱</a></li>
# Sample markup used to demonstrate selecting an element by two classes.
html = '''
<div id='container'>
<ul class='list'>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00B002">找对象</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/001002">新鲜事</a></li>
<li class="Sq_leftNav_forum1 active"><a href="/shuo/forum/001004"><span class="bold">同城互助</span></a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/007005">同城活动</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00D001">虞城有爱</a></li>
</ul>
</div>
'''
from pyquery import PyQuery
doc = PyQuery(html)
li = doc('.Sq_leftNav_forum1.active') # chain both classes with dots; writing "Sq_leftNav_forum1 active" (with a space) does not work
print(li)
<li class="Sq_leftNav_forum1 active"><a href="/shuo/forum/001004"><span class="bold">同城互助</span></a></li>
# Sample markup used to demonstrate iterating over a selection.
html = '''
<div id='container'>
<ul class='list'>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00B002">找对象</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/001002">新鲜事</a></li>
<li class="Sq_leftNav_forum1 active"><a href="/shuo/forum/001004"><span class="bold">同城互助</span></a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/007005">同城活动</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00D001">虞城有爱</a></li>
</ul>
</div>
'''
from pyquery import PyQuery
doc = PyQuery(html)
lis = doc('li')
# print(lis)
# Looping over the selection directly prints only object addresses:
# for li in lis:
#     print(li)
# To iterate, call .items(): it returns an iterator whose elements print
# as markup instead of memory addresses.
lis = doc('li').items()
# print(lis)
for li in lis:
    # The loop body must be indented; without it the snippet does not parse.
    print(li)
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00B002">找对象</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/001002">新鲜事</a></li>
<li class="Sq_leftNav_forum1 active"><a href="/shuo/forum/001004"><span class="bold">同城互助</span></a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/007005">同城活动</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00D001">虞城有爱</a></li>
很重要,拿图片链接才能保存二进制数据
# Sample markup used to demonstrate reading an attribute value.
html = '''
<div id='container'>
<ul class='list'>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00B002">找对象</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/001002">新鲜事</a></li>
<li class="Sq_leftNav_forum1 active"><a href="/shuo/forum/001004"><span class="bold">同城互助</span></a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/007005">同城活动</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00D001">虞城有爱</a></li>
</ul>
</div>
'''
from pyquery import PyQuery
doc = PyQuery(html)
# The <a> inside the <li> that carries both classes.
a = doc('.Sq_leftNav_forum1.active a')
# print(a)
# attr(name) returns the value of the named attribute.
print(a.attr('href'))
/shuo/forum/001004
# Second form: the same attribute read through attribute-style access.
# Reuses `a` from the previous snippet.
print(a.attr.href)
/shuo/forum/001004
# Sample markup used to demonstrate extracting text content.
html = '''
<div id='container'>
<ul class='list'>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00B002">找对象</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/001002">新鲜事</a></li>
<li class="Sq_leftNav_forum1 active"><a href="/shuo/forum/001004"><span class="bold">同城互助</span></a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/007005">同城活动</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00D001">虞城有爱</a></li>
</ul>
</div>
'''
from pyquery import PyQuery
doc = PyQuery(html)
# The <a> inside the active item; printing it shows the full markup,
# including the nested <span>.
a = doc('.Sq_leftNav_forum1.active a')
print(a)
<a href="/shuo/forum/001004"><span class="bold">同城互助</span></a>
# text() returns only the text content, with the nested tags stripped.
# Reuses `a` from the previous snippet.
print(a.text())
同城互助
能帮我们快速查找或筛选数据
# Sample markup used to demonstrate class manipulation.
html = '''
<div id='container'>
<ul class='list'>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00B002">找对象</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/001002">新鲜事</a></li>
<li class="Sq_leftNav_forum1 active"><a href="/shuo/forum/001004"><span class="bold">同城互助</span></a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/007005">同城活动</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00D001">虞城有爱</a></li>
</ul>
</div>
'''
from pyquery import PyQuery
doc = PyQuery(html)
# Select the item that carries both classes; it is modified in the next snippets.
li = doc('.Sq_leftNav_forum1.active')
print(li)
<li class="Sq_leftNav_forum1 active"><a href="/shuo/forum/001004"><span class="bold">同城互助</span></a></li>
# removeClass() modifies the selected element in place: "active" disappears
# from its class attribute. Reuses `li` from the previous snippet.
li.removeClass('active')
print(li)
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/001004"><span class="bold">同城互助</span></a></li>
# addClass() puts the "active" class back on the same element.
li.addClass('active')
print(li)
<li class="Sq_leftNav_forum1 active"><a href="/shuo/forum/001004"><span class="bold">同城互助</span></a></li>
# Sample markup used to demonstrate setting attributes and inline styles.
html = '''
<div id='container'>
<ul class='list'>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00B002">找对象</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/001002">新鲜事</a></li>
<li class="Sq_leftNav_forum1 active"><a href="/shuo/forum/001004"><span class="bold">同城互助</span></a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/007005">同城活动</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00D001">虞城有爱</a></li>
</ul>
</div>
'''
from pyquery import PyQuery
doc = PyQuery(html)
li = doc('.Sq_leftNav_forum1.active')
# attr(name, value) with two arguments sets the attribute: adds name="link".
li.attr('name', 'link')
print(li)
# css(property, value) writes an inline style: adds style="font-size: 14px".
li.css('font-size', '14px')
print(li)
<li class="Sq_leftNav_forum1 active" name="link"><a href="/shuo/forum/001004"><span class="bold">同城互助</span></a></li>
<li class="Sq_leftNav_forum1 active" name="link" style="font-size: 14px"><a href="/shuo/forum/001004"><span class="bold">同城互助</span></a></li>
# Sample markup used to demonstrate removing elements from the tree.
html = '''
<div id='container'>
<ul class='list'>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00B002">找对象</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/001002">新鲜事</a></li>
<li class="Sq_leftNav_forum1 active"><a href="/shuo/forum/001004"><span class="bold">同城互助</span></a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/007005">同城活动</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00D001">虞城有爱</a></li>
</ul>
</div>
'''
from pyquery import PyQuery
doc = PyQuery(html)
wrap = doc('.list')
# print(wrap)
# print(wrap.text())
# Remove every <a> below the list; the <li> elements are left empty.
wrap.find('a').remove()
print(wrap)
<ul class="list">
<li class="Sq_leftNav_forum1"/>
<li class="Sq_leftNav_forum2"/>
<li class="Sq_leftNav_forum1 active"/>
<li class="Sq_leftNav_forum2"/>
<li class="Sq_leftNav_forum1"/>
</ul>
http://pyquery.readthedocs.io/en/latest/api.html
jQuery所有选择器都适用
# Sample markup used to demonstrate pseudo-class selectors.
html = '''
<div id='container'>
<ul class='list'>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00B002">找对象</a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/001002">新鲜事</a></li>
<li class="Sq_leftNav_forum1 active"><a href="/shuo/forum/001004"><span class="bold">同城互助</span></a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/007005">同城活动</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00D001">虞城有爱</a></li>
</ul>
</div>
'''
from pyquery import PyQuery
doc = PyQuery(html)
li = doc('li:last') # the last matched <li>
print(li)
li = doc('li:last-child') # the <li> that is the last child of its parent
print(li)
li = doc('li:nth-child(3)') # counting starts at 1: the third child
print(li)
li = doc('li:gt(2)') # filter by index, starting at 0; gt means greater than, lt means less than
print(li)
li = doc('li:eq(4)') # eq means the index equals the given value
print(li)
li = doc('li:contains(虞城)') # contains matches elements containing the given text; mainly used to filter by content
print(li)
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00D001">虞城有爱</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00D001">虞城有爱</a></li>
<li class="Sq_leftNav_forum1 active"><a href="/shuo/forum/001004"><span class="bold">同城互助</span></a></li>
<li class="Sq_leftNav_forum2"><a href="/shuo/forum/007005">同城活动</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00D001">虞城有爱</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00D001">虞城有爱</a></li>
<li class="Sq_leftNav_forum1"><a href="/shuo/forum/00D001">虞城有爱</a></li>