当前位置: 首页 > 工具软件 > BeautifulSoup > 使用案例 >

BeautifulSoup库的基本使用

阎星华
2023-12-01

1.库的基本使用

1.1 安装库

pip install beautifulsoup4==4.9.1
pip install lxml  #  解析器更加强大,速度更快

1.2 导入库

from bs4 import BeautifulSoup

1.3 不同文件操作

  • html
    soup=BeautifulSoup(html,'lxml') 
    print(soup) # 打印html
    print(soup.prettify()) # 打印美化的html【缩进格式化的html】
    
  • html文件
    soup = BeautifulSoup(open("index.html"),'lxml')
    

2. 获取html结构化数据

  • 获取标签title
    soup.title
    
  • 获取标签head
    soup.head
    
  • 获取标签name
    soup.title.name
    
  • 获取标签title内的内容
    soup.title.string
    
  • 获取title父级标签
    soup.title.parent
    
  • 获取title父级标签名称
    soup.title.parent.name
    
  • 获取标签属性
    soup.div.attrs
    
  • 获取所有文字内容
    print(soup.get_text())
    

3.节点操作

3.1 获取子节点

from bs4 import BeautifulSoup
html_doc = """<html><div><p>p content</p></div></html>"""
soup = BeautifulSoup(html_doc, 'lxml')
  • 获取div节点
    div_tag = soup.div
    print(div_tag) # <div><p>p content</p></div>
    
  • 获取div节点内部内容
    print(div_tag.contents)#[<p>p content</p>] 列表
    title_tag = div_tag.contents[0]
    print(title_tag) #  <p>p content</p>
    
  • 获取div节点内部p标签内容
    title_tag = div_tag.contents[0]
    print(title_tag) #  <p>p content</p>
    print(title_tag.contents) # ['p content']
    
  • 获取所有子节点【直接子节点】
    for child in soup.p.children:
    	print(child)
    
  • 获取所有子节点【子孙后代节点递归循环】
    for child in div_tag.descendants:
    	print(child)
    

3.2 获取节点内容

3.2.1 单个内容

from bs4 import BeautifulSoup
html_doc = """<html><div><p>p content</p></div></html>"""
soup = BeautifulSoup(html_doc, 'lxml')
  • string【返回NavigableString对象;标签内有多个子节点时返回None】
    • 获取div节点内容
      print (soup.div.string) # p content
      
    • 获取p节点内容
      print (soup.p.string) # p content
      
  • text【返回内容为字符串类型】
    • 获取div节点内容
      print (soup.div.text) # p content
      
    • 获取p节点内容
      print (soup.p.text) # p content
      

3.2.2 多个内容

from bs4 import BeautifulSoup
html_doc = """
<html>
    <head>
        <title>p data</title>
    </head>
    <body>
        <p class="title"><b>b data</b></p>
        <p class="story">
            p before
            <a href="http://example.com/a1" class="sister" id="link1">a1</a>
            <a href="http://example.com/a2" class="sister" id="link2">a2</a> 
            <a href="http://example.com/a3" class="sister" id="link3">a3</a>
            p end
        </p>
    </body>
</html>
"""
soup = BeautifulSoup(html_doc, 'lxml')
  • strings
    • 遍历获取内容,存在空白内容
      print(soup.strings)
      for string in soup.strings:
          print(repr(string))
      # <generator object Tag._all_strings at 0x00000203DA9B5430>
      # '\n'
      # '\n'
      # 'p data'
      # '\n'
      # '\n'
      # '\n'
      # 'b data'
      # '\n'
      # '\n            p before\n            '
      # 'a1'
      # '\n'
      # 'a2'
      # '\n'
      # 'a3'
      # '\n            p end\n        '
      # '\n'
      # '\n'
      # '\n'
      
  • stripped_strings
    • 遍历获取内容,不存在空白内容
      print(soup.stripped_strings)
      for string in soup.stripped_strings:
          print(repr(string))
      # <generator object Tag.stripped_strings at 0x0000024DF5675430>
      # 'p data'
      # 'b data'
      # 'p before'
      # 'a1'
      # 'a2'
      # 'a3'
      # 'p end'
      

4. 搜索匹配节点

from bs4 import BeautifulSoup
html_doc = """
<html>
    <head>
        <title>p data</title>
    </head>
    <body>
        <p class="title"><b>b data</b></p>
        <p class="story">
            p before
            <a href="http://example.com/a1" class="sister" id="link1">a1</a>
            <a href="http://example.com/a2" class="sister" id="link2">a2</a> 
            <a href="http://example.com/a3" class="sister" id="link3">a3</a>
            p end
        </p>
    </body>
</html>
"""
soup = BeautifulSoup(html_doc, 'lxml')

4.1 find_all

  • find_all( name , attrs , recursive , string , **kwargs )
  • 符合过滤器条件的所有节点

4.1.1 标签名查找

  • 根据name标签名查找
    print(soup.find_all("a")) # 查找a标签
    # [<a class="sister" href="http://example.com/a1" id="link1">a1</a>, <a class="sister" href="http://example.com/a2" id="link2">a2</a>, <a class="sister" href="http://example.com/a3" id="link3">a3</a>]
    
  • 根据id查找
    print(soup.find_all(id="link1"))
    # [<a class="sister" href="http://example.com/a1" id="link1">a1</a>]
    
  • 多条件标签查找【条件列表】
    print(soup.find_all(["title", "b"]))
    # [<title>p data</title>, <b>b data</b>]
    
  • 根据正则表达式查找
    import re
    for tag in soup.find_all(re.compile("^b")):
       print(tag.name)
    # body
    # b
    

4.1.2 标签属性查找

  • 根据id查找

    • id匹配
      print(soup.find_all(id="link1"))
      # [<a class="sister" href="http://example.com/a1" id="link1">a1</a>]
      
    • 匹配存在id属性的标签
      print(soup.find_all(id=True))
      #[<a class="sister" href="http://example.com/a1" id="link1">a1</a>, <a class="sister" href="http://example.com/a2" id="link2">a2</a>, <a class="sister" href="http://example.com/a3" id="link3">a3</a>]
      
  • 根据href查找

    import re
    print(soup.find_all(href=re.compile("1")))
    # [<a class="sister" href="http://example.com/a1" id="link1">a1</a>]
    
  • 根据text查找

    import re
    print(soup.find_all(text=re.compile("^a")))
    # ['a1', 'a2', 'a3']
    
  • 根据class选择器查找

    import re
    print(soup.find_all(class_=re.compile("s")))
    # [<p class="story">
    #             p before
    #             <a class="sister" href="http://example.com/a1" id="link1">a1</a>
    # <a class="sister" href="http://example.com/a2" id="link2">a2</a>
    # <a class="sister" href="http://example.com/a3" id="link3">a3</a>
    #             p end
    #         </p>, <a class="sister" href="http://example.com/a1" id="link1">a1</a>, <a class="sister" href="http://example.com/a2" id="link2">a2</a>, <a class="sister" href="http://example.com/a3" id="link3">a3</a>]
    

4.1.3 多条件匹配

import re
from bs4 import BeautifulSoup
html_doc = """
<html>
    <body>
        <p class="story" id="p1">
            <a href="http://example.com/a1" class="c1" id="link1" data_a1="data1">a1</a>
            <a href="http://example.com/a2" class="c2" id="link2">a2</a> 
        </p>
    </body>
</html>
"""
soup = BeautifulSoup(html_doc, 'lxml')
soup.find_all(href=re.compile("1"), id='link1')
print(soup.find_all("a", class_="c1"))
print(soup.find_all(attrs={"data_a1": "data1"}))
print(soup.find_all('p', class_="story", id="p1"))
print(soup.find_all('p', attrs={"class":"story", "id":"p1"}))
# [<a class="c1" data_a1="data1" href="http://example.com/a1" id="link1">a1</a>]
# [<a class="c1" data_a1="data1" href="http://example.com/a1" id="link1">a1</a>]
# [<p class="story" id="p1"> <a class="c1" data_a1="data1" href="http://example.com/a1" id="link1">a1</a><a class="c2" href="http://example.com/a2" id="link2">a2</a></p>]
# [<p class="story" id="p1"><a class="c1" data_a1="data1" href="http://example.com/a1" id="link1">a1</a><a class="c2" href="http://example.com/a2" id="link2">a2</a></p>]

4.1.4 text 参数内容匹配

import re
from bs4 import BeautifulSoup
html_doc = """
<html>
    <body>
        <p class="story" id="p1">
            <a href="http://example.com/a1" class="c1" id="link1" data_a1="data1">a1 text</a>
            <a href="http://example.com/a2" class="c2" id="link2">a2 content</a> 
        </p>
    </body>
</html>
"""
soup = BeautifulSoup(html_doc, 'lxml')
  • 内容匹配
    print(soup.find_all(text="a1 text")) 
    # ['a1 text']
    
  • 多条件匹配
    print(soup.find_all(text=["a1 text", "a2 content"])) 
    # ['a1 text', 'a2 content']
    
  • 模糊匹配
    print(soup.find_all(text=re.compile("n")))
    # ['a2 content']
    

4.1.5 limit 参数内容限制数量

  • 限制返回长度
from bs4 import BeautifulSoup
html_doc = """
<html>
    <body>
        <p class="story" id="p1">
            <a href="http://example.com/a1" class="c1" id="link1" data_a1="data1">a1 text</a>
            <a href="http://example.com/a2" class="c2" id="link2">a2 content</a> 
            <a href="http://example.com/a2" class="c2" id="link3">a2 content</a> 
            <a href="http://example.com/a2" class="c2" id="link4">a2 content</a> 
        </p>
    </body>
</html>
"""
soup = BeautifulSoup(html_doc, 'lxml')
print(soup.find_all("a",limit=2))
print(soup.find_all("a")[0:2])
# [<a class="c1" data_a1="data1" href="http://example.com/a1" id="link1">a1 text</a>, <a class="c2" href="http://example.com/a2" id="link2">a2 content</a>]
# [<a class="c1" data_a1="data1" href="http://example.com/a1" id="link1">a1 text</a>, <a class="c2" href="http://example.com/a2" id="link2">a2 content</a>]

4.2 find

  • find( name , attrs , recursive , string , **kwargs )
  • 符合过滤器条件的第一个节点
  • 只想获取第一个,就可以用find,否则用find_all
    from bs4 import BeautifulSoup
    html_doc = """
    <html>
       <body>
           <p class="story" id="p1">
               <a href="http://example.com/a1" class="c1" id="link1" data_a1="data1">a1 text</a>
               <a href="http://example.com/a2" class="c2" id="link2">a2 content</a> 
           </p>
       </body>
    </html>
    """
    soup = BeautifulSoup(html_doc, 'lxml')
    print(soup.find('a'))
    print(soup.find("notag")) # `find_all()` 方法没有找到目标时返回空列表, `find()` 方法找不到目标时返回 `None`
    print(soup.find("p").find("a"))
    # <a class="c1" data_a1="data1" href="http://example.com/a1" id="link1">a1 text</a>
    # None
    # <a class="c1" data_a1="data1" href="http://example.com/a1" id="link1">a1 text</a>
    

4.3 find_parents() 和 find_parent()

from bs4 import BeautifulSoup
html_doc = """
<html>
    <head>
        <title>p data</title>
    </head>
    <body>
        <p class="title"><b>b data</b></p>
        <p class="story">
            p before
            <a href="http://example.com/a1" class="sister" id="link1">a1</a>
            <a href="http://example.com/a2" class="sister" id="link2">a2</a> 
            <a href="http://example.com/a3" class="sister" id="link3">a3</a>
            p end
        </p>
    </body>
</html>
"""
soup = BeautifulSoup(html_doc, 'lxml')
a_string = soup.find(text="a1")
print(a_string.find_parent()) # 亲父亲
print(a_string.find_parents()) # 所有祖先节点(逐级向上的全部父节点)
print(a_string.find_parent("p"))

5. css选择器查找

5.1 通过class查找

print(soup.select(".c1")) 

5.2 通过id查找

print(soup.select("#link1"))

5.3 组合查找

  • 组合查找
    print(soup.select("p #link2"))
    
  • 子标签查找【下一级直系元素】
    print(soup.select("p > #link2"))
    
  • 标签属性
    print(soup.find('a', class_="c1").img.attrs['src'])
    
  • 既有class也有id选择器的标签
     soup.select(".story#test")
    
  • 多个class选择器的标签
    soup.select(".story.c1")
    
  • 多个class选择器和一个id选择器的标签
    soup.select(".story.data1#book")
    

5.4 属性查找

from bs4 import BeautifulSoup
html_doc = """
<html>
    <body>
        <p class="story" id="p1">
            <a href="http://example.com/a1" class="c1" id="link1" data_a1="data1">a1 text</a>
            <a href="http://example.com/a2" class="c2" id="link2">a2 content</a> 
        </p>
    </body>
</html>
"""
soup = BeautifulSoup(html_doc, 'lxml')
print(soup.select("a[href='http://example.com/a2']"))
# [<a class="c2" href="http://example.com/a2" id="link2">a2 content</a>]

6.其他操作

6.1 查找选择框

trs=soup.select('#TableList table tr')[1:]
for tr in trs: # 遍历选择框数据
    tds=tr.select('td')
    td_list=[]
    for td in tds:
        text=td.text
        td_list.append(text)

6.2 函数使用

6.2.1 get(),获取元素属性

div.find('img').get('src')
 类似资料: