Python 通过lxml遍历html xpath

Posted 文刀文刀

tags:

篇首语：本文由小常识网（cha138.com）小编为大家整理，主要介绍了 Python 通过 lxml 遍历 HTML XPath 相关的知识，希望对你有一定的参考价值。

#coding:utf-8
'''
Created on 2017-10-09

@author: li.liu
'''
from selenium import webdriver
from lxml import etree
import urllib
import urllib2
import time

# Page whose markup is fetched below and later opened in the browser by test1().
# Alternative targets left by the original author:
#url='http://www.woyihome.com'
url='http://sso.woyihome.com/sso/pc-login'
#url='http://www.baidu.com'

# Browser-like User-Agent sent with the request.
user_agent='Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'

# Form fields sent in the request body.
values = {'name' : 'WHY',
          'location' : 'SDU',
          'language' : 'Python' }

headers = { 'User-Agent' : user_agent }
data = urllib.urlencode(values)
# Supplying `data` makes urllib2 issue a POST instead of a GET.
req = urllib2.Request(url, data, headers)
response = urllib2.urlopen(req)
try:
    # Keep the raw bytes. Calling .encode('utf-8') on a Python 2 byte string
    # first decodes it as ASCII and raises UnicodeDecodeError on any non-ASCII
    # page; for pure-ASCII pages the bytes are identical either way.
    html1 = response.read()
finally:
    response.close()

def test1():
    x1={}

    #html1=urllib.urlopen(url).read().decode(‘utf-8‘)
    #print html1
    hxml=etree.HTML(html1)
    #print hxml
    htree=etree.ElementTree(hxml)
    #print htree
    id_dite=htree.xpath(//*[@id])
    #print id_dite
    coun=0
    for id_items in id_dite:
        #print id_items.items()
        #print htree.getpath(id_items)       
        for id_item in id_items.items():
            #print id_item
            if id_item[0]==id:
                id_str=//*[@id="+id_item[1]+"]
                x1[id_str]=[]
                #print id_str
                id_path=htree.getpath(htree.xpath(id_str)[0])
                #print id_path
                id_str1=id_str+//*
                idelem_list=htree.xpath(id_str1)
                #print idelem_list
                for e in idelem_list:
                    if len(e.items())==0:
                        pass
                    else:
                        e_path=htree.getpath(e)
                        #print e_path                   
                        e_path1=e_path.split(id_path)
                        #print e_path1[1]
                        if len(e_path1)>1:
                            e_str=id_str+e_path1[1]
                            e_list=e_str.split(/)
                            if li in e_list[len(e_list)-1] or ul in e_list[len(e_list)-1] or span in e_list[len(e_list)-1]:
                                pass
                            else:
                                #print e_str
                                coun+=1
                                x1[id_str].append(e_str)
    ‘‘‘
    for i in x1:
    #print i
        for i1 in x1[i]:
            print i1
        
    ‘‘‘                                
    a=0
    b=0                            
    driver=webdriver.Chrome()
    driver.get(url)
    #print driver.title                            
    for i in x1:
        #print i
        for i1 in x1[i]:
            #print i1
            try:
                d=driver.find_element_by_xpath(i1)
                a+=1
                print d.text
                time.sleep(2)
                driver.find_element_by_xpath(i1).click()
                headx=driver.window_handles
                #print headx
                print 当前页面地址:\n,driver.current_url
                time.sleep(1)
                print i,\n
                if len(headx)!=1:
                    driver.switch_to_window(headx[1])
                    durl= driver.current_url
                    print 当前页面地址:\n,durl,\n
                    if woyihome in durl:
                        driver.close()
                        driver.switch_to_window(headx[0])
                    else:
                        k=1
                        break
                elif localhost in driver.current_url:
                    
                    print a
            except :
                pass
                #print b
    print a        
            
            
            
    #driver.quit()        
            
            
            
            
            
            
            
        #print ‘====================================================‘
                                
                        
                        
                        
                        
                        
                        
                            
    print coun
                
                
                
                
            
            
            
            
# Run the click-through check only when executed as a script, so importing
# this module does not launch a browser.
if __name__ == '__main__':
    test1()

 

以上是关于 Python 通过 lxml 遍历 HTML XPath 的主要内容，如果未能解决你的问题，请参考以下文章：

Python 通过lxml 解析html页面自动组合xpath实例

Python爬虫:通过爬取CSDN博客信息,学习lxml库与XPath语法

Python爬虫:通过爬取CSDN博客信息,学习lxml库与XPath语法

python中的beautifulsoup和xpath有啥异同点

python爬微信公众号前10篇历史文章-lxml&xpath初探

使用lxml的Python脚本,xpath返回空列表