# Install (run in a shell, not inside Python):
#   pip install beautifulsoup4==4.9.1
#   pip install lxml    # a faster, more capable parser backend
# --- BeautifulSoup basics: parse HTML and access tags ---
from bs4 import BeautifulSoup

# Parse an HTML string ('lxml' is the fast, tolerant parser installed above).
# The original snippet referenced an undefined name `html`; define a sample here.
html = "<html><head><title>The title</title></head><body><div id='d1'>div text</div></body></html>"
soup = BeautifulSoup(html, 'lxml')
print(soup)             # the parsed html
print(soup.prettify())  # the indented ("pretty") html

# To parse an HTML file instead of a string:
# soup = BeautifulSoup(open("index.html"), 'lxml')

# Dotted access returns the FIRST tag with that name.
soup.title              # the <title> tag
soup.head               # the <head> tag
soup.title.name         # tag name: 'title'
soup.title.string       # the text inside <title>
soup.title.parent       # parent tag of <title> (<head>)
soup.title.parent.name  # parent tag name: 'head'
soup.div.attrs          # dict of the first <div>'s attributes
print(soup.get_text())  # all text in the document
# --- Traversing child nodes: .contents, .children, .descendants ---
from bs4 import BeautifulSoup

html_doc = """<html><div><p>p content</p></div><html>"""
soup = BeautifulSoup(html_doc, 'lxml')

# The first <div> node.
div_tag = soup.div
print(div_tag)             # <div><p>p content</p></div>

# .contents returns the direct children as a list.
print(div_tag.contents)    # [<p>p content</p>]
title_tag = div_tag.contents[0]
print(title_tag)           # <p>p content</p>
print(title_tag.contents)  # ['p content']

# .children is a generator over the direct children.
for child in soup.p.children:
    print(child)

# .descendants recursively yields every descendant node.
# (The original iterated an undefined name `divs`; use the tag parsed above.)
for child in div_tag.descendants:
    print(child)
# --- .string vs .text ---
from bs4 import BeautifulSoup

html_doc = """<html><div><p>p content</p></div><html>"""
soup = BeautifulSoup(html_doc, 'lxml')

# .string: the tag's single string child (a NavigableString);
# it is None when the tag has more than one child string.
print(soup.div.string)  # p content
print(soup.p.string)    # p content

# .text: all descendant strings concatenated into one plain str.
print(soup.div.text)    # p content
print(soup.p.text)      # p content
# --- .strings and .stripped_strings generators ---
from bs4 import BeautifulSoup

html_doc = """
<html>
<head>
<title>p data</title>
</head>
<body>
<p class="title"><b>b data</b></p>
<p class="story">
p before
<a href="http://example.com/a1" class="sister" id="link1">a1</a>
<a href="http://example.com/a2" class="sister" id="link2">a2</a>
<a href="http://example.com/a3" class="sister" id="link3">a3</a>
p end
</p>
</body>
</html>
"""
soup = BeautifulSoup(html_doc, 'lxml')

# .strings lazily yields every string in the document, whitespace included.
print(soup.strings)  # a generator object
for string in soup.strings:
    print(repr(string))
# <generator object Tag._all_strings at 0x00000203DA9B5430>
# '\n'
# '\n'
# 'p data'
# '\n'
# '\n'
# '\n'
# 'b data'
# '\n'
# '\n p before\n '
# 'a1'
# '\n'
# 'a2'
# '\n'
# 'a3'
# '\n p end\n '
# '\n'
# '\n'
# '\n'

# .stripped_strings yields the same strings with surrounding whitespace
# removed, skipping strings that are whitespace-only.
print(soup.stripped_strings)  # a generator object
for string in soup.stripped_strings:
    print(repr(string))
# <generator object Tag.stripped_strings at 0x0000024DF5675430>
# 'p data'
# 'b data'
# 'p before'
# 'a1'
# 'a2'
# 'a3'
# 'p end'
# --- find_all(name, attrs, recursive, string, **kwargs) ---
import re

from bs4 import BeautifulSoup

html_doc = """
<html>
<head>
<title>p data</title>
</head>
<body>
<p class="title"><b>b data</b></p>
<p class="story">
p before
<a href="http://example.com/a1" class="sister" id="link1">a1</a>
<a href="http://example.com/a2" class="sister" id="link2">a2</a>
<a href="http://example.com/a3" class="sister" id="link3">a3</a>
p end
</p>
</body>
</html>
"""
soup = BeautifulSoup(html_doc, 'lxml')

# Search by tag name.
print(soup.find_all("a"))
# [<a class="sister" href="http://example.com/a1" id="link1">a1</a>, <a class="sister" href="http://example.com/a2" id="link2">a2</a>, <a class="sister" href="http://example.com/a3" id="link3">a3</a>]

# Search by id keyword.
print(soup.find_all(id="link1"))
# [<a class="sister" href="http://example.com/a1" id="link1">a1</a>]

# A list matches any of the listed tag names.
print(soup.find_all(["title", "b"]))
# [<title>p data</title>, <b>b data</b>]

# A regex is matched against tag names.
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)
# body
# b

# id=True matches every tag that HAS an id attribute.
print(soup.find_all(id=True))
# [<a class="sister" href="http://example.com/a1" id="link1">a1</a>, <a class="sister" href="http://example.com/a2" id="link2">a2</a>, <a class="sister" href="http://example.com/a3" id="link3">a3</a>]

# Search by href with a regex.
print(soup.find_all(href=re.compile("1")))
# [<a class="sister" href="http://example.com/a1" id="link1">a1</a>]

# Search by text content.
print(soup.find_all(text=re.compile("^a")))
# ['a1', 'a2', 'a3']

# Search by CSS class (class_ because `class` is a Python keyword).
print(soup.find_all(class_=re.compile("s")))
# [<p class="story">
# p before
# <a class="sister" href="http://example.com/a1" id="link1">a1</a>
# <a class="sister" href="http://example.com/a2" id="link2">a2</a>
# <a class="sister" href="http://example.com/a3" id="link3">a3</a>
# p end
# </p>, <a class="sister" href="http://example.com/a1" id="link1">a1</a>, <a class="sister" href="http://example.com/a2" id="link2">a2</a>, <a class="sister" href="http://example.com/a3" id="link3">a3</a>]
# --- find_all with several filters at once, and the attrs= dict ---
import re

from bs4 import BeautifulSoup

html_doc = """
<html>
<body>
<p class="story" id="p1">
<a href="http://example.com/a1" class="c1" id="link1" data_a1="data1">a1</a>
<a href="http://example.com/a2" class="c2" id="link2">a2</a>
</p>
</body>
</html>
"""
soup = BeautifulSoup(html_doc, 'lxml')

# Multiple keyword filters combine with AND.
# (The original computed this result without printing it.)
print(soup.find_all(href=re.compile("1"), id='link1'))
print(soup.find_all("a", class_="c1"))
# Attribute names that are not valid Python identifiers go through attrs=...
print(soup.find_all(attrs={"data_a1": "data1"}))
print(soup.find_all('p', class_="story", id="p1"))
print(soup.find_all('p', attrs={"class": "story", "id": "p1"}))
# [<a class="c1" data_a1="data1" href="http://example.com/a1" id="link1">a1</a>]
# [<a class="c1" data_a1="data1" href="http://example.com/a1" id="link1">a1</a>]
# [<a class="c1" data_a1="data1" href="http://example.com/a1" id="link1">a1</a>]
# [<p class="story" id="p1"> <a class="c1" data_a1="data1" href="http://example.com/a1" id="link1">a1</a><a class="c2" href="http://example.com/a2" id="link2">a2</a></p>]
# [<p class="story" id="p1"><a class="c1" data_a1="data1" href="http://example.com/a1" id="link1">a1</a><a class="c2" href="http://example.com/a2" id="link2">a2</a></p>]
# --- Filtering find_all() by text content ---
import re
from bs4 import BeautifulSoup

html_doc = """
<html>
<body>
<p class="story" id="p1">
<a href="http://example.com/a1" class="c1" id="link1" data_a1="data1">a1 text</a>
<a href="http://example.com/a2" class="c2" id="link2">a2 content</a>
</p>
</body>
</html>
"""
soup = BeautifulSoup(html_doc, 'lxml')

# Exact match of one string.
exact_hits = soup.find_all(text="a1 text")
print(exact_hits)
# ['a1 text']

# A list matches any of its members.
list_hits = soup.find_all(text=["a1 text", "a2 content"])
print(list_hits)
# ['a1 text', 'a2 content']

# A regex matches anywhere in the string.
regex_hits = soup.find_all(text=re.compile("n"))
print(regex_hits)
# ['a2 content']
# --- Capping the number of find_all() results ---
from bs4 import BeautifulSoup

html_doc = """
<html>
<body>
<p class="story" id="p1">
<a href="http://example.com/a1" class="c1" id="link1" data_a1="data1">a1 text</a>
<a href="http://example.com/a2" class="c2" id="link2">a2 content</a>
<a href="http://example.com/a2" class="c2" id="link3">a2 content</a>
<a href="http://example.com/a2" class="c2" id="link4">a2 content</a>
</p>
</body>
</html>
"""
soup = BeautifulSoup(html_doc, 'lxml')

# limit=2 makes the search stop after two matches...
first_two = soup.find_all("a", limit=2)
print(first_two)
# ...which yields the same tags as slicing the full result.
every_link = soup.find_all("a")
print(every_link[0:2])
# [<a class="c1" data_a1="data1" href="http://example.com/a1" id="link1">a1 text</a>, <a class="c2" href="http://example.com/a2" id="link2">a2 content</a>]
# [<a class="c1" data_a1="data1" href="http://example.com/a1" id="link1">a1 text</a>, <a class="c2" href="http://example.com/a2" id="link2">a2 content</a>]
# find(name, attrs, recursive, string, **kwargs)
# Use find() when only the first match is needed; otherwise use find_all().
from bs4 import BeautifulSoup

html_doc = """
<html>
<body>
<p class="story" id="p1">
<a href="http://example.com/a1" class="c1" id="link1" data_a1="data1">a1 text</a>
<a href="http://example.com/a2" class="c2" id="link2">a2 content</a>
</p>
</body>
</html>
"""
soup = BeautifulSoup(html_doc, 'lxml')
# find() returns the FIRST matching tag.
print(soup.find('a'))
print(soup.find("notag"))  # find_all() returns [] when nothing matches; find() returns None.
# find() calls can be chained tag by tag.
print(soup.find("p").find("a"))
# <a class="c1" data_a1="data1" href="http://example.com/a1" id="link1">a1 text</a>
# None
# <a class="c1" data_a1="data1" href="http://example.com/a1" id="link1">a1 text</a>
# --- Searching upward (find_parent/find_parents) and CSS select() ---
from bs4 import BeautifulSoup

html_doc = """
<html>
<head>
<title>p data</title>
</head>
<body>
<p class="title"><b>b data</b></p>
<p class="story">
p before
<a href="http://example.com/a1" class="sister" id="link1">a1</a>
<a href="http://example.com/a2" class="sister" id="link2">a2</a>
<a href="http://example.com/a3" class="sister" id="link3">a3</a>
p end
</p>
</body>
</html>
"""
soup = BeautifulSoup(html_doc, 'lxml')

# Start from the NavigableString 'a1' and walk upward.
a_string = soup.find(text="a1")
print(a_string.find_parent())     # the direct parent (<a id="link1">)
print(a_string.find_parents())    # every ancestor, innermost first
print(a_string.find_parent("p"))  # the nearest <p> ancestor

# CSS selectors.
# (The original selected ".c1", a class this document does not contain,
# which printed an empty list; select the class that actually exists.)
print(soup.select(".sister"))     # by class
print(soup.select("#link1"))      # by id
print(soup.select("p #link2"))    # descendant of <p>
print(soup.select("p > #link2"))  # direct child of <p>

# Read an attribute of a found tag.
# (The original called soup.find('a', class_="c1").img.attrs['src'], but this
# document has neither a "c1" class nor an <img>, so it raised AttributeError.)
print(soup.find('a', class_="sister").attrs['href'])
# Combined selectors:
#   tags with class "story" AND id "test":           soup.select(".story#test")
#   tags with both classes "story" and "c1":         soup.select(".story.c1")
#   tags with two classes and an id:                 soup.select(".story.data1#book")
# --- Selecting by attribute value, plus a table-scraping pattern ---
from bs4 import BeautifulSoup

html_doc = """
<html>
<body>
<p class="story" id="p1">
<a href="http://example.com/a1" class="c1" id="link1" data_a1="data1">a1 text</a>
<a href="http://example.com/a2" class="c2" id="link2">a2 content</a>
</p>
</body>
</html>
"""
soup = BeautifulSoup(html_doc, 'lxml')

# Select tags whose attribute equals an exact value.
print(soup.select("a[href='http://example.com/a2']"))
# [<a class="c2" href="http://example.com/a2" id="link2">a2 content</a>]

# Typical table-scraping loop: skip the header row, then collect each row's
# cell texts. NOTE: '#TableList table tr' matches nothing in html_doc above,
# so trs is empty here; the loop is kept as a reference pattern.
trs = soup.select('#TableList table tr')[1:]
for tr in trs:
    tds = tr.select('td')
    td_list = []
    for td in tds:
        td_list.append(td.text)

# Reading an attribute from a tag nested inside another tag — kept as a
# commented fragment because `div` was never defined in the original snippet
# (running it raised NameError):
# div.find('img').get('src')