1.xpath解析之xml
from lxml import etree
# 1. Terminology
# - Tree: the whole HTML or XML structure.
# - Node: every tag in HTML; in XML each tag is a node as well.
# - Root node: the first node of the tree; for HTML it is the <html> tag.
# - Attribute: a node attribute (in HTML, a tag attribute).

# 2. XML data format
# JSON and XML are two general-purpose data formats used to exchange data
# between different languages.
# 1) Prepare the data.
# NOTE: the root tag is spelled "supermaket"; the XPath expressions further
# down rely on that exact spelling, so do not "correct" it here alone.
xml_str = """
<supermaket>
<name>永辉超市</name>
<address>肖家河大厦</address>
<goodslist>
<goods name="泡面" price="3.5" count="20"></goods>
<goods name="矿泉水" price="2" count="50"></goods>
<goods name="面包" price="5" count="15"></goods>
</goodslist>
<goods price="35" count="20">
<name>烟</name>
</goods>
<worker_list>
<cashier name="张三" pay="4000"></cashier>
<shoppingGuide name="李四" pay="3500"></shoppingGuide>
</worker_list>
</supermaket>
"""
# 2) Build the tree; etree.XML hands back the root node of the parsed data.
supermarket = etree.XML(xml_str)
# 3) Selecting nodes
# node.xpath(path) looks up the nodes matching `path` and returns them as a
# list of node objects.

# a. Absolute path, written as "/absolute/path": it always starts at the root
#    node, no matter which node xpath() is called on.
root_cashiers = supermarket.xpath('/supermaket/worker_list/cashier')
print(root_cashiers)  # e.g. [<Element cashier at 0x26a1eb29200>]
worker_list = supermarket.xpath('/supermaket/worker_list')[0]
print(worker_list)  # e.g. <Element worker_list at 0x26a1eb29180>
# Called on <worker_list>, yet still resolved from the root (<supermaket>),
# so nothing matches.
not_found = worker_list.xpath('/worker_list/cashier')
print(not_found)  # []

# b. Relative path: "." stands for the current node, and the leading "./"
#    may be omitted. All three lookups below find the same <cashier>.
for context_node, relative_path in (
    (supermarket, './worker_list/cashier'),
    (worker_list, './cashier'),
    (worker_list, 'cashier'),
):
    print(context_node.xpath(relative_path))  # e.g. [<Element cashier at 0x...>]
# c. "//path" searches the whole document, at any depth; the result does not
#    depend on which node xpath() is called on.
all_cashiers = supermarket.xpath('//cashier')
print(all_cashiers)  # e.g. [<Element cashier at 0x10616ecc0>]

# Every <goods> in the document: the three inside <goodslist> plus the one
# directly under the root, i.e. four elements.
all_goods = supermarket.xpath('//goods')
print(all_goods)

# Only the <goods> that are direct children of a <goodslist>: three elements.
listed_goods = supermarket.xpath('//goodslist/goods')
print(listed_goods)
# 4) Node content -- syntax: <path to the node>/text()
shop_name = supermarket.xpath('./name/text()')
print(shop_name)  # text of the <name> directly under the root: ['永辉超市']
every_name = supermarket.xpath('//name/text()')
print(every_name)  # text of every <name> at any depth: ['永辉超市', '烟']

# 5) Attribute values -- syntax: <path to the node>/@attribute_name
goods_prices = supermarket.xpath('//goods/@price')
print(goods_prices)  # price attribute of every <goods>: ['3.5', '2', '5', '35']
2.xpath解析之html
from lxml import etree
# Parse the HTML test page. The context manager closes the file as soon as
# its content has been read, instead of leaving the handle open.
with open('files/test.html', encoding='utf-8') as html_file:
    html = etree.HTML(html_file.read())

# "//h1" looks for <h1> anywhere in the document ...
h1 = html.xpath('//h1/text()')
print(h1)
# ... while "./body/h1" walks down step by step from the node held in `html`.
h1 = html.xpath('./body/h1/text()')
print(h1)
# 1. Predicates (conditions) -- syntax: <path to the tag>[predicate]

# 1) [N] - the N-th tag of that name among its siblings.
first_body_p = html.xpath('./body/p[1]/text()')
print(first_body_p)
first_li_texts = html.xpath('//li[1]/p/text()')
print(first_li_texts)

# 2) [last()]   - the last tag of that name on the same level.
#    [last()-N] - the (N+1)-th tag counted from the end.
second_to_last_li_last_p = html.xpath('./body/ul/li[last()-1]/p[last()]/text()')
print(second_to_last_li_last_p)

# 3) Position comparisons:
#    [position()>N], [position()<N], [position()>=N], [position()<=N]
first_two_li_texts = html.xpath('./body/ul/li[position()<=2]/p/text()')
print(first_two_li_texts)

# 4) [@attr] - tags that carry the given attribute,
#    e.g. p[@class] is every <p> that has a class attribute.
p_with_class = html.xpath('./body/div/p[@class]/text()')
print(p_with_class)
#    [@attr='value'] - tags whose attribute has exactly that value.
p_class_b = html.xpath('./body/div/p[@class="b"]/text()')
print(p_class_b)

# 5) [child >, <, >=, <= or = value] - filter tags by the content of one of
#    their child tags.
li_second_p_over_4 = html.xpath('./body/ul/li[p[2]>4]/p/text()')
print(li_second_p_over_4)
li_third_p_over_30 = html.xpath('./body/ul/li[p[3]>30]/p[1]/text()')
print(li_third_p_over_30)
li_first_p_is_bread = html.xpath('./body/ul/li[p[1] = "面包"]/p/text()')
print(li_first_p_is_bread)
# 2. Wildcard: *

# 1) As a tag name, * matches any tag.
div1_children = html.xpath('./body/div[@id="div1"]/*')
print(div1_children)
div1_children_with_class = html.xpath('./body/div[@id="div1"]/*[@class]')
print(div1_children_with_class)
c1_texts = html.xpath('//*[@class="c1"]/text()')
print(c1_texts)

# 2) As an attribute name, @* matches any attribute.
last_div_p_with_attr = html.xpath('./body/div[last()]/p[@*]/text()')
print(last_div_p_with_attr)

# 3. Union of several paths: |
# NOTE: each side of the bar has to be a complete path of its own.
first_p_path = './body/ul/li/p[1]/text()'
second_p_path = './body/ul/li/p[2]/text()'
first_and_second_p = html.xpath(first_p_path + '|' + second_p_path)
print(first_and_second_p)
3.html测试数据(保存为 files/test.html,供第2部分的代码读取)
<!DOCTYPE html>
<!-- Test page for the XPath examples; presumably the file the Python code
     opens as files/test.html (confirm the path against the script). -->
<html lang="en">
<head>
<meta charset="UTF-8">
<title>商店</title>
</head>
<body>
<!-- Shop header: one <h1> followed by two <p> directly under <body>. -->
<h1>永辉超市</h1>
<p>肖家河大厦</p>
<p>营业中</p>
<!-- Goods list: every <li> holds three <p> in the order name, price, count. -->
<ul>
<li>
<p class="name">泡面</p>
<p class="price">3.5</p>
<p class="count">15</p>
</li>
<li>
<p class="name">矿泉水</p>
<p class="price">2</p>
<p class="count">120</p>
</li>
<li>
<p class="name">面包</p>
<p class="price">5</p>
<p class="count">42</p>
</li>
<li>
<p class="name">充电宝</p>
<p class="price">150</p>
<p class="count">10</p>
</li>
</ul>
<!-- First <div>: one <p> with only an id and three <p> with a class. -->
<div>
<p id="a">p1</p>
<p class="b">p2</p>
<p class="c1">p3</p>
<p class="d">p4</p>
</div>
<!-- Second (last) <div>, id="div1": mixed child tags, two of them with class="c1". -->
<div id="div1">
<p class="c1">p1</p>
<p id="p2">p2</p>
<a href="">a1</a>
<span class="c1">span1</span>
<img src="https://gimg2.baidu.com/image_search/src=http%3A%2F%2Fbpic.588ku.com%2Felement_origin_min_pic%2F17%2F06%2F13%2F5c5a1442f0ec72e59829ee10d891f224.jpg%21r650&refer=http%3A%2F%2Fbpic.588ku.com&app=2002&size=f9999,10000&q=a80&n=0&g=0n&fmt=jpeg?sec=1631690803&t=ddfb673477426b3255f364e59966b2f1">
</div>
</body>
</html>