# Reference for the Python lxml library: https://lxml.de/

 

from lxml import etree
# Sample HTML fragment: a <div> wrapping a <ul> of five <li> items, each
# holding one link. Only the commented-out etree.HTML(text) example below
# uses it.
text = (
    '\n'
    '<div>\n'
    '    <ul>\n'
    '        <li class="item-0"><a href="link1.html">first item</a></li>\n'
    '        <li class="item-1"><a href="link2.html">second item</a></li>\n'
    '        <li class="item-inactive"><a href="link3.html">third item</a></li>\n'
    '        <li class="item-1"><a href="link4.html">fourth item</a></li>\n'
    '        <li class="item-0"><a href="link5.html">fifth item</a></li>\n'
    '    </ul>\n'
    '</div>\n'
)
# Initialize from the HTML string, building an object that can be queried with XPath
# html = etree.HTML(text)
# etree.tostring() produces the corrected HTML as bytes
# result = etree.tostring(html)
# print(result.decode('utf-8'))

# Read a local HTML file
# html = etree.parse('./res.html',etree.HTMLParser())
# result = etree.tostring(html)
# print(result.decode('utf-8'))

# Select all nodes
# html = etree.parse('./res.html',etree.HTMLParser())
# result = html.xpath('//*')
# print(result)

# Parse the local file with lxml's HTML parser; every query below runs
# against this tree.
html = etree.parse('./2.html', etree.HTMLParser())
# 5 All nodes: select every li node in the local HTML file
# result = html.xpath('//li')
# print(result[0])
# 6 Child nodes: select the a nodes directly under li
# result = html.xpath('//li/a')
# 7 Parent node: class attribute of the parent of the a node whose href is link3.html
# result = html.xpath('//a[@href="link3.html"]/../@class')
# 8 Attribute matching: select the li nodes whose class is item-0
# result = html.xpath('//li[@class="item-0"]')
# 9 Text: get the text of the a nodes under every li node
# result = html.xpath('//li/a/text()')
# result = html.xpath('//li[@class="item-0"]//text()')
# 10 Attribute values
# result = html.xpath('//li/a/@href')
# 11 Matching an attribute that holds multiple values
# result = html.xpath('//li[contains(@class,"class_1")]/a/text()')
# 12 Matching on several attributes at once
# result = html.xpath('//li[contains(@class,"class_1") and @name="item"]/a/text()')
# 13 Selecting by position
# result1 = html.xpath('//li[1]/a/text()')
# result2 = html.xpath('//li[last()]/a/text()')
# result3 = html.xpath('//li[position()<3]/a/text()')
# result4 = html.xpath('//li[last()-2]/a/text()')
# print(result1)
# print(result2)
# print(result3)
# print(result4)
# 14 Axis selection
# NOTE: each query rebinds `result`, so only the last one survives; print
# `result` right after the query you want to inspect.
# ancestor: all ancestor nodes
result = html.xpath('//li[1]/ancestor::*')
# ancestor, restricted to div elements
result = html.xpath('//li[1]/ancestor::div')
# attribute: all attributes of the node
result = html.xpath('//li[1]/attribute::*')
# child: direct children, here restricted to the a node whose href is link1.html
result = html.xpath('//li[1]/child::a[@href="link1.html"]')
# descendant: all descendant nodes, here restricted to span elements
result = html.xpath('//li[1]/descendant::span')
# following: all nodes after the current node, here only the second one
result = html.xpath('//li[1]/following::*[2]')
# following-sibling: all sibling nodes after the current node
result = html.xpath('//li[1]/following-sibling::*')

 

# 10-07 13:02