Python 通过lxml 解析html页面自动组合xpath实例

Posted 文刀文刀

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Python 通过lxml 解析html页面自动组合xpath实例相关的知识,希望对你有一定的参考价值。

#coding:utf-8
"""
@author: li.liu
"""
import re
import time
import urllib
import urllib.request

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionBuilder, ActionChains


#url=‘http://www.baidu.com‘
url=www.woyihome.com
driver= webdriver.Chrome()
driver.get(url)
web_title=driver.title
def test1():
    head=driver.current_window_handle
    print driver.current_url
    xpathset=set()
    try:
        html1=urllib.urlopen(url).read().decode(utf-8)
        hetree=etree.HTML(html1)#lxml解析html
        lxml1=etree.ElementTree(hetree)#lxml.etree解析html
        hiter=hetree.iter()#加载到迭代器中
        #print hiter
        str1=‘‘
        
        for t in hiter:#遍历每个元素
            for item in t.items():#遍历每个元素的属性
                c=0
                d=0
                for i in item:#遍历每个属性的名字和值  
                    if i == id:#查找属性名为id的元素
                        str1 =//*[@+i+="+item[c+1]+"]##通过id属性值定位达到元素
                        xx=lxml1.xpath(str1)#查找元素
                        #print ‘\n‘,xx
                        lgx=lxml1.getpath(xx[0])#查找元素路径
                        #print lgx
                        s= lxml1.xpath(str1+//*)#查找子元素生成list列表
                        for s1 in s:#遍历所有属性为str1的子元素
                            #print s1.text
                            #print lxml1.getpath(s1)
                            for ss1 in s1.items():#遍历str1子元素的属性
                                for sss1 in ss1 :#遍历属性名和值
                                        try:
                                            #print sss1
                                            lgs1=lxml1.getpath(s1)#获取str1子元素s1的路径
                                            path_split=lgs1.split(lgx)[1]#分割子处理元素属性值的字符串
                                            str3=str+path_split#生成xpath
                                            print \n,str3
                                            xpathset.add(str3)

                                        except:pass
                    #else:   
                        #pass              
                            #print lxml1.getpath(s1)
                        #print i,‘\n‘
                    c+=1
                    d+=1
                print .,
        print \r
        ‘‘‘
        for i in xpathset:
            print i
                           
            try:
                driver.find_element_by_xpath(i).text
                driver.find_element_by_xpath(i).click()
                durll=driver.current_url
                headx=driver.window_handles
                #print headx
                print ‘当前页面地址:\n‘,durll
                time.sleep(1)
                print i,‘\n‘
                if len(headx)!=1:
                    driver.switch_to_window(headx[1])
                    durl= driver.current_url
                    print ‘当前页面地址:\n‘,durl,‘\n‘
                    if ‘101.37.179.183‘ in durl:
                        driver.close()
                        driver.switch_to_window(headx[0])
                    else:
                        k=1
                        break
                    
                    
                else:
                    driver.get(url)
                
                
            
            
            except:
                pass
            ‘‘‘
        print len(xpathset)            
            #print ‘\t‘
        
        #driver.get(‘http://101.37.179.183‘)
        #print driver.title
        
    finally:
        #driver.quit()
        print ...
    ‘‘‘    
    try:
        time.sleep(1)
        #print driver.find_element_by_xpath(‘//*[@id="wrapper"]‘),1
        #print driver.find_element_by_xpath(‘//*[@id="wrapper"]/div[2]/a[1]‘)
        driver.find_element_by_xpath(str1)
        time.sleep(50000)
    finally:
        print 3
        driver.quit()        
    ‘‘‘    
        
def test2():
    http_dict={}
    durll=‘‘
    http_dict[durll]=[]
    head=driver.current_window_handle
    xpath_dict={}
    xpathset=set()
    #try:
    html1=urllib.urlopen(url).read().decode(utf-8)
    hetree=etree.HTML(html1)#lxml解析html
    lxml1=etree.ElementTree(hetree)#lxml.etree解析html
    hiter=hetree.iter()#加载到迭代器中
    #print hiter
    hid1=lxml1.xpath(//*[@id])
    hid=lxml1.xpath(//*[@id]//*)
    for t in hid1:
        id_items=t.items()
        print t.items()#打印id属性的元素所有属性
        tpath=lxml1.getpath(t)
        print tpath#打印id属性的元素的路径
        
        
        for id in id_items:
            if id in id[0]:
                str1=//*[@id="+id[1]+"]
                xpath_dict[str1]=[]
                #print xpath_dict
                print str1
                str3=str1+//*
                print str3
                id_list= lxml1.xpath(str3)
                for idist in id_list:
                    idpath= lxml1.getpath(idist)
                    idxpathlist=idpath.split(tpath)
                    if len(idxpathlist)>1:  
                        id_xpath=str1+idxpathlist[1]
                        xpath_dict[str1].append(id_xpath)
                        #print xpath_dict[str1]
                        #print idxpathlist
                    #else:
                        #print ‘+++++++++++++++++++++++++++++++++++++++‘
                        #print idxpathlist,‘stop‘,len(idxpathlist)
                print ==============================================
    cont=0
    k=0
    
    for i in xpath_dict:
        #print xpath_dict[i]
        for t in xpath_dict[i]:
            durll=‘‘
            try:
                time.sleep(1)
                elem_text=driver.find_element_by_xpath(t).text
                driver.find_element_by_xpath(t).click()
                durll=driver.current_url
                headx=driver.window_handles
                #print headx
                
                if len(headx)!=1:
                    driver.switch_to_window(headx[1])
                    durll= driver.current_url
                    print 链接元素名:,elem_text
                    print 页面名:,driver.title
                    print 当前页面地址:\n,durll
                    print t,\n
                    if 101.37.179.183 in durll:
                        driver.close()
                        driver.switch_to_window(headx[0])
                    else:
                        k=1
                        break
                else:
                    if driver.title !=web_title:
                        print 链接元素名:,elem_text
                        print 页面名:,driver.title
                        print 当前页面地址:\n,durll
                        print t,\n
                        driver.back()
                    pass
                
            except:
                if k==1 or localhost in durll:
                    pass
                else:
                    try:
                        print 动态首项xpath:,dict[i][0]
                        elem=driver.find_element_by_xpath(xpath_dict[i][0])
                        ActionChains(driver).move_to_element(elem).perform()
                        time.sleep(1)
                        driver.find_element_by_xpath(t).click()
                        print 当前动态页面地址为:,\n,driver.current_url
                        print t,\n
                        if driver.title !=web_title:
                            t1= 链接元素名:+elem_text
                            t2= 页面名:+driver.title
                            t3= 当前页面地址:+durll
                            print t1,\n,t2,\n,t3,\n,t,\n
                            http_dict[durll].append(t1)
                            http_dict[durll].append(t2)
                            http_dict[durll].append(t3)
                            driver.back()
                    except(Exception):
                        pass
                        #print Exception
            
            cont+=1
            print cont   
    
    
    with open(E:/1/http.txt, w) as handle:
        for t in http_dict:
            str2=t+‘‘+str(http_dict[t])
            handle.writelines(str2)
        
    
    
    
    
    
    
    
    
    
test2()
print 结束
#driver.quit()

    

 

以上是关于Python 通过lxml 解析html页面自动组合xpath实例的主要内容,如果未能解决你的问题,请参考以下文章

python爬虫lxml基本用法

Python通过Lxml库解析网络爬虫抓取到的html

需要python lxml语法帮助来解析html

python HTML解析之 - lxml

Python爬虫:通过爬取CSDN博客信息,学习lxml库与XPath语法

Python爬虫:通过爬取CSDN博客信息,学习lxml库与XPath语法