#coding:utf-8 ‘‘‘ @author: li.liu ‘‘‘ from selenium import webdriver from selenium.webdriver.common.action_chains import ActionBuilder, ActionChains from lxml import etree import urllib import time import re #url=‘http://www.baidu.com‘ url=‘www.woyihome.com‘ driver= webdriver.Chrome() driver.get(url) web_title=driver.title def test1(): head=driver.current_window_handle print driver.current_url xpathset=set() try: html1=urllib.urlopen(url).read().decode(‘utf-8‘) hetree=etree.HTML(html1)#lxml解析html lxml1=etree.ElementTree(hetree)#lxml.etree解析html hiter=hetree.iter()#加载到迭代器中 #print hiter str1=‘‘ for t in hiter:#遍历每个元素 for item in t.items():#遍历每个元素的属性 c=0 d=0 for i in item:#遍历每个属性的名字和值 if i == ‘id‘:#查找属性名为id的元素 str1 =‘//*[@‘+i+‘="‘+item[c+1]+‘"]‘##通过id属性值定位达到元素 xx=lxml1.xpath(str1)#查找元素 #print ‘\n‘,xx lgx=lxml1.getpath(xx[0])#查找元素路径 #print lgx s= lxml1.xpath(str1+‘//*‘)#查找子元素生成list列表 for s1 in s:#遍历所有属性为str1的子元素 #print s1.text #print lxml1.getpath(s1) for ss1 in s1.items():#遍历str1子元素的属性 for sss1 in ss1 :#遍历属性名和值 try: #print sss1 lgs1=lxml1.getpath(s1)#获取str1子元素s1的路径 path_split=lgs1.split(lgx)[1]#分割子处理元素属性值的字符串 str3=str+path_split#生成xpath print ‘\n‘,str3 xpathset.add(str3) except:pass #else: #pass #print lxml1.getpath(s1) #print i,‘\n‘ c+=1 d+=1 print ‘.‘, print ‘\r‘ ‘‘‘ for i in xpathset: print i try: driver.find_element_by_xpath(i).text driver.find_element_by_xpath(i).click() durll=driver.current_url headx=driver.window_handles #print headx print ‘当前页面地址:\n‘,durll time.sleep(1) print i,‘\n‘ if len(headx)!=1: driver.switch_to_window(headx[1]) durl= driver.current_url print ‘当前页面地址:\n‘,durl,‘\n‘ if ‘101.37.179.183‘ in durl: driver.close() driver.switch_to_window(headx[0]) else: k=1 break else: driver.get(url) except: pass ‘‘‘ print len(xpathset) #print ‘\t‘ #driver.get(‘http://101.37.179.183‘) #print driver.title finally: #driver.quit() print ‘...‘ ‘‘‘ try: time.sleep(1) #print driver.find_element_by_xpath(‘//*[@id="wrapper"]‘),1 #print driver.find_element_by_xpath(‘//*[@id="wrapper"]/div[2]/a[1]‘) driver.find_element_by_xpath(str1) time.sleep(50000) finally: print 3 driver.quit() ‘‘‘ def test2(): http_dict={} durll=‘‘ http_dict[durll]=[] head=driver.current_window_handle xpath_dict={} xpathset=set() #try: html1=urllib.urlopen(url).read().decode(‘utf-8‘) hetree=etree.HTML(html1)#lxml解析html lxml1=etree.ElementTree(hetree)#lxml.etree解析html hiter=hetree.iter()#加载到迭代器中 #print hiter hid1=lxml1.xpath(‘//*[@id]‘) hid=lxml1.xpath(‘//*[@id]//*‘) for t in hid1: id_items=t.items() print t.items()#打印id属性的元素所有属性 tpath=lxml1.getpath(t) print tpath#打印id属性的元素的路径 for id in id_items: if ‘id‘ in id[0]: str1=‘//*[@id="‘+id[1]+‘"]‘ xpath_dict[str1]=[] #print xpath_dict print str1 str3=str1+‘//*‘ print str3 id_list= lxml1.xpath(str3) for idist in id_list: idpath= lxml1.getpath(idist) idxpathlist=idpath.split(tpath) if len(idxpathlist)>1: id_xpath=str1+idxpathlist[1] xpath_dict[str1].append(id_xpath) #print xpath_dict[str1] #print idxpathlist #else: #print ‘+++++++++++++++++++++++++++++++++++++++‘ #print idxpathlist,‘stop‘,len(idxpathlist) print ‘==============================================‘ cont=0 k=0 for i in xpath_dict: #print xpath_dict[i] for t in xpath_dict[i]: durll=‘‘ try: time.sleep(1) elem_text=driver.find_element_by_xpath(t).text driver.find_element_by_xpath(t).click() durll=driver.current_url headx=driver.window_handles #print headx if len(headx)!=1: driver.switch_to_window(headx[1]) durll= driver.current_url print ‘链接元素名:‘,elem_text print ‘页面名:‘,driver.title print ‘当前页面地址:\n‘,durll print t,‘\n‘ if ‘101.37.179.183‘ in durll: driver.close() driver.switch_to_window(headx[0]) else: k=1 break else: if driver.title !=web_title: print ‘链接元素名:‘,elem_text print ‘页面名:‘,driver.title print ‘当前页面地址:\n‘,durll print t,‘\n‘ driver.back() pass except: if k==1 or ‘localhost‘ in durll: pass else: try: print ‘动态首项xpath:‘,dict[i][0] elem=driver.find_element_by_xpath(xpath_dict[i][0]) ActionChains(driver).move_to_element(elem).perform() time.sleep(1) driver.find_element_by_xpath(t).click() print ‘当前动态页面地址为:‘,‘\n‘,driver.current_url print t,‘\n‘ if driver.title !=web_title: t1= ‘链接元素名:‘+elem_text t2= ‘页面名:‘+driver.title t3= ‘当前页面地址:‘+durll print t1,‘\n‘,t2,‘\n‘,t3,‘\n‘,t,‘\n‘ http_dict[durll].append(t1) http_dict[durll].append(t2) http_dict[durll].append(t3) driver.back() except(Exception): pass #print Exception cont+=1 print cont with open(‘E:/1/http.txt‘, ‘w‘) as handle: for t in http_dict: str2=t+‘‘+str(http_dict[t]) handle.writelines(str2) test2() print ‘结束‘ #driver.quit()