爬虫--selenium

Posted zhuifeng-mayi

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬虫--selenium相关的知识,希望对你有一定的参考价值。

什么是selenium?

技术分享图片

基本使用

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support. wait import WebDriverWait
# Basic usage: open Baidu, submit a search for "Python", wait for the result
# container to appear, then print the resulting URL and cookies.
browser = webdriver.Chrome()  # start Chrome through its WebDriver
try:
    browser.get("https://www.baidu.com")  # navigate to the Baidu home page
    # Named search_box (not `input`) so the builtin input() is not shadowed.
    search_box = browser.find_element_by_id("kw")  # element with id "kw"
    search_box.send_keys("Python")  # type the query text
    search_box.send_keys(Keys.ENTER)  # press Enter to submit it
    # 10 is the wait's timeout in seconds, not a fixed sleep: until() returns
    # as soon as the condition holds.
    wait = WebDriverWait(browser, 10)
    # Block until the element with id "content_left" is present in the DOM.
    wait.until(EC.presence_of_element_located((By.ID, "content_left")))
    print(browser.current_url)  # URL after the search was submitted
    print(browser.get_cookies())  # cookies of the current session
    # print(browser.page_source)  # uncomment to dump the page's HTML source
finally:
    # Always close the browser window, even if a step above raised.
    browser.close()
技术分享图片
https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=0&rsv_idx=1&tn=baidu&wd=Python&rsv_pq=c618fa5900004b25&rsv_t=c25fWJbEN2wl13gOxRoocQDIAUMPaoguAnEu9Rg4KGX4uoRC0lynG5EjFGY&rqlang=cn&rsv_enter=1&rsv_sug3=6&rsv_sug2=0&inputT=162&rsv_sug4=162
[{path: /, value: 1468_21104_18559_26350_22075, secure: False, domain: .baidu.com, httpOnly: False, name: H_PS_PSSID}, {path: /, value: 3FFEC0A0709997465509BC1AFB51F757:FG=1, secure: False, domain: .baidu.com, expiry: 3685018278.537968, httpOnly: False, name: BAIDUID}, {path: /, value: 3FFEC0A0709997465509BC1AFB51F757, secure: False, domain: .baidu.com, expiry: 3685018278.538044, httpOnly: False, name: BIDUPSID}, {path: /, value: 1537534637, secure: False, domain: .baidu.com, expiry: 3685018278.538084, httpOnly: False, name: PSTM}, {path: /, value: 0, secure: False, domain: www.baidu.com, expiry: 2483614633.353963, httpOnly: False, name: delPer}, {path: /, value: 0, secure: False, domain: www.baidu.com, httpOnly: False, name: BD_HOME}, {path: /, value: B490B5EBF6F3CD402E515D22BCDA1598, secure: False, domain: .baidu.com, expiry: 1537621032.332359, httpOnly: False, name: BDORZ}, {path: /, value: 12314353, secure: False, domain: www.baidu.com, expiry: 1538398632, httpOnly: False, name: BD_UPN}, {path: /, value: 1, secure: False, domain: www.baidu.com, httpOnly: False, name: BD_CK_SAM}, {path: /, value: 1, secure: False, domain: .baidu.com, httpOnly: False, name: PSINO}, {path: /, value: bf67l36p%2FgarFggwpeficZXTG5zE%2FhBZEp2ev5JvDSo8venU134svju%2FJL4, secure: False, domain: www.baidu.com, expiry: 1537537225, httpOnly: False, name: H_PS_645EC}]
打印后的结果为:

声明浏览器对象

from selenium import webdriver
# Each line instantiates the WebDriver class for a different browser.
browser_1 = webdriver.Chrome()
browser_2 = webdriver.Firefox()
browser_3 = webdriver.Edge()
browser_4 = webdriver.PhantomJS()
browser_5 = webdriver.Safari()

访问页面

from selenium import webdriver

# Visit a page with Chrome, print something from the driver, close the window.
browser = webdriver.Chrome()
# Host name corrected from the misspelled "tabao" to "taobao".
browser.get("http://www.taobao.com")
# NOTE: get_cookie is referenced without being called, so this prints the
# bound method object itself (that is what the sample output after this
# snippet shows). Call browser.get_cookies() to print the cookies instead.
print(browser.get_cookie)
browser.close()
技术分享图片
<bound method WebDriver.get_cookie of <selenium.webdriver.chrome.webdriver.WebDriver (session="65ad512a5c81e7d9f6f3bd81a4ba3495")>>
打印后的结果为:

查找元素

单个元素

from selenium import webdriver

# Locate the same element (id "q") in three different ways and print each
# result; the sample output shows all three refer to the same element.
browser = webdriver.Chrome()
browser.get("https://www.taobao.com")
input_first = browser.find_element_by_id("q")
# The XPath expression must be passed as a string. Single quotes are used
# outside because the expression itself contains double quotes.
input_second = browser.find_element_by_xpath('//*[@id="q"]')
input_third = browser.find_element_by_css_selector("#q")
print(input_first)
print("----------------------------------------------------------------")
print(input_second)
print("----------------------------------------------------------------")
print(input_third)
browser.close()
技术分享图片
<selenium.webdriver.remote.webelement.WebElement (session="72af913bda52ce0848486b08ba93d3a1", element="0.3093750540466209-1")>
----------------------------------------------------------------
<selenium.webdriver.remote.webelement.WebElement (session="72af913bda52ce0848486b08ba93d3a1", element="0.3093750540466209-1")>
----------------------------------------------------------------
<selenium.webdriver.remote.webelement.WebElement (session="72af913bda52ce0848486b08ba93d3a1", element="0.3093750540466209-1")>
打印的结果为:

比较通用的查找方式

from selenium import webdriver
# By supplies the locator strategies (By.ID, By.CSS_SELECTOR, ...) used by
# the generic find_element() call; it was missing from this snippet.
from selenium.webdriver.common.by import By

# Generic lookup: find_element(strategy, value) instead of a strategy-specific
# find_element_by_* method.
browser = webdriver.Chrome()
browser.get("https://www.taobao.com")
input_first = browser.find_element(By.ID, "q")  # the id value must be a string
print(input_first)
browser.close()

运行的结果与上面一样!

多个元素

from selenium import webdriver

# Look up every element matching a CSS selector and print the resulting list.
browser = webdriver.Chrome()
browser.get("https://www.taobao.com")
selector = ".service-bd li"
# The plural find_elements_* form yields a list of WebElement objects.
matches = browser.find_elements_by_css_selector(selector)
print(matches)
browser.close()
技术分享图片
[<selenium.webdriver.remote.webelement.WebElement (session="d52b2ef4ead695decaf705a0c331eb06", element="0.7934950105071805-1")>, <selenium.webdriver.remote.webelement.WebElement (session="d52b2ef4ead695decaf705a0c331eb06", element="0.7934950105071805-2")>, <selenium.webdriver.remote.webelement.WebElement (session="d52b2ef4ead695decaf705a0c331eb06", element="0.7934950105071805-3")>, <selenium.webdriver.remote.webelement.WebElement (session="d52b2ef4ead695decaf705a0c331eb06", element="0.7934950105071805-4")>, <selenium.webdriver.remote.webelement.WebElement (session="d52b2ef4ead695decaf705a0c331eb06", element="0.7934950105071805-5")>, <selenium.webdriver.remote.webelement.WebElement (session="d52b2ef4ead695decaf705a0c331eb06", element="0.7934950105071805-6")>, <selenium.webdriver.remote.webelement.WebElement (session="d52b2ef4ead695decaf705a0c331eb06", element="0.7934950105071805-7")>, <selenium.webdriver.remote.webelement.WebElement (session="d52b2ef4ead695decaf705a0c331eb06", element="0.7934950105071805-8")>, <selenium.webdriver.remote.webelement.WebElement (session="d52b2ef4ead695decaf705a0c331eb06", element="0.7934950105071805-9")>, <selenium.webdriver.remote.webelement.WebElement (session="d52b2ef4ead695decaf705a0c331eb06", element="0.7934950105071805-10")>, <selenium.webdriver.remote.webelement.WebElement (session="d52b2ef4ead695decaf705a0c331eb06", element="0.7934950105071805-11")>, <selenium.webdriver.remote.webelement.WebElement (session="d52b2ef4ead695decaf705a0c331eb06", element="0.7934950105071805-12")>, <selenium.webdriver.remote.webelement.WebElement (session="d52b2ef4ead695decaf705a0c331eb06", element="0.7934950105071805-13")>, <selenium.webdriver.remote.webelement.WebElement (session="d52b2ef4ead695decaf705a0c331eb06", element="0.7934950105071805-14")>, <selenium.webdriver.remote.webelement.WebElement (session="d52b2ef4ead695decaf705a0c331eb06", element="0.7934950105071805-15")>, <selenium.webdriver.remote.webelement.WebElement 
(session="d52b2ef4ead695decaf705a0c331eb06", element="0.7934950105071805-16")>]
打印后的结果为:

元素交互操作

对获取的元素调用交互方法

from selenium import webdriver
import time

# Interact with elements: type into the field with id "q", clear it, type
# again, then click the element with class "btn-search".
browser = webdriver.Chrome()
browser.get("https://www.taobao.com")
# Named search_box (not `input`) so the builtin input() is not shadowed.
search_box = browser.find_element_by_id("q")
search_box.send_keys("iPhone")  # text to type must be a string literal
time.sleep(1)  # short pause so the first text is visible before clearing
search_box.clear()
search_box.send_keys("iPad")
# The class name must be a string; unquoted it parses as `btn - search`.
button = browser.find_element_by_class_name("btn-search")
button.click()
browser.close()

交互动作

将动作附加到动作链中串行执行

from selenium import webdriver
from selenium.webdriver import ActionChains

# Queue a drag-and-drop in an ActionChains object and run it with perform().
browser = webdriver.Chrome()
url = "http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable"
browser.get(url)  # request the page
browser.switch_to.frame("iframeResult")  # switch into the frame by name
# Singular find_element_*: drag_and_drop() takes one source element and one
# target element, not the lists returned by find_elements_*.
source = browser.find_element_by_css_selector("#draggable")
target = browser.find_element_by_css_selector("#droppable")
actions = ActionChains(browser)  # create the action chain
actions.drag_and_drop(source, target)  # queue the drag-and-drop
actions.perform()  # execute the queued actions

执行javascript

from selenium import webdriver

# Run JavaScript in the page: scroll to the bottom, then show an alert.
browser = webdriver.Chrome()
browser.get("https://www.zhihu.com/explore")
# execute_script() takes the JavaScript source as a string.
browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")  # scroll to the bottom
# Single quotes outside because the script itself contains double quotes.
browser.execute_script('alert("To Bottom")')

执行后的结果为:

技术分享图片

获取元素信息

获取属性

from selenium import webdriver
from selenium.webdriver import ActionChains

# Read an attribute of an element: print the element, then its "class" value.
browser = webdriver.Chrome()
browser.get("https://www.zhihu.com/explore")
logo = browser.find_element_by_id("zh-top-link-logo")  # the id must be a string
print(logo)
# The attribute name is passed as a string; a bare `class` is a Python
# keyword and would be a syntax error.
print(logo.get_attribute("class"))
技术分享图片
<selenium.webdriver.remote.webelement.WebElement (session="ef2d80c82e37098c4c702fe5c0e2df31", element="0.9948931372437708-1")>
zu-top-link-logo
打印后的结果为:

获取文本值

from selenium import webdriver
from selenium.webdriver import ActionChains

# Read the text of an element found by class name.
browser = webdriver.Chrome()
browser.get("https://www.zhihu.com/explore")
# Named post_link (not `input`) so the builtin input() is not shadowed; the
# class name must be a string.
post_link = browser.find_element_by_class_name("post-link")
print(post_link.text)  # text content of the element
技术分享图片
《红色警戒》的世界:没有希特勒 二战死了1亿人
打印后的结果为:

获取ID、位置、标签名、大小

from selenium import webdriver
from selenium.webdriver import ActionChains

# Print the text, id, location, tag name and size of one element.
browser = webdriver.Chrome()
browser.get("https://www.zhihu.com/explore")
# Named post_link (not `input`) so the builtin input() is not shadowed; the
# class name must be a string.
post_link = browser.find_element_by_class_name("post-link")
print(post_link.text)  # text content
print(post_link.id)  # id attribute of the WebElement object (see sample output)
print(post_link.location)  # position, printed as a dict with x and y
print(post_link.tag_name)  # tag name
print(post_link.size)  # size, printed as a dict with height and width
browser.close()
技术分享图片
《红色警戒》的世界:没有希特勒 二战死了1亿人
0.22091173377675544-1
{y: 304, x: 32}
a
{height: 16, width: 306}
打印后的结果为:

Frame

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
# Frames: look an element up inside a frame, show that a class-name lookup
# for "logo" fails there, then switch back to the parent frame and retry.
browser = webdriver.Chrome()
url = "http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable"
browser.get(url)
browser.switch_to.frame("iframeResult")  # enter the frame by name
source = browser.find_element_by_css_selector("#draggable")
print(source)
try:
    # Looked up while still inside the frame; the sample output shows this
    # raises NoSuchElementException ("NO LOGO" is printed).
    logo = browser.find_element_by_class_name("logo")
except NoSuchElementException:
    print("NO LOGO")
browser.switch_to.parent_frame()  # leave the frame
# The same lookup from the parent frame finds the element.
logo = browser.find_element_by_class_name("logo")
print(logo)
print(logo.text)
技术分享图片
<selenium.webdriver.remote.webelement.WebElement (session="ecfecc0e705df8976f5241726b66e273", element="0.27322378119978463-1")>
NO LOGO
<selenium.webdriver.remote.webelement.WebElement (session="ecfecc0e705df8976f5241726b66e273", element="0.8128333237150809-2")>
RUNOOB.COM
打印后的结果为:

等待

隐式等待

当使用了隐式等待执行测试的时候,如果WebDriver没有在DOM中找到元素,将继续等待,超出设定时间后则抛出找不到元素的异常,换句话说,当查找元素或元素并没有立即出现的时候,隐式等待将等待一段时间再查找DOM,默认的时间是0。

from selenium import webdriver

# Implicit wait: element lookups keep retrying for up to the configured time
# before failing (see the explanation preceding this snippet).
browser = webdriver.Chrome()
browser.implicitly_wait(10)  # implicit wait of 10 seconds
browser.get("https://www.zhihu.com/explore")
# Named add_question (not `input`) so the builtin input() is not shadowed;
# the class name must be a string.
add_question = browser.find_element_by_class_name("zu-top-add-question")
print(add_question)
技术分享图片
<selenium.webdriver.remote.webelement.WebElement (session="87a2e958b9a3b58334e8c2ec76d0419e", element="0.014192877625801792-1")>
打印后的结果为:

显式等待

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Explicit wait: block until specific conditions hold, with a 10 s timeout.
browser = webdriver.Chrome()
browser.get("http://www.taobao.com/")
wait = WebDriverWait(browser, 10)
# Locator values must be strings. This condition yields all matching
# elements, hence the plural name (which also avoids shadowing input()).
search_inputs = wait.until(EC.presence_of_all_elements_located((By.ID, "q")))
# A class selector needs the leading dot. A bare "btn-search" is a CSS type
# selector (it looks for a <btn-search> tag), so nothing matches and until()
# times out. That is the likely cause of the TimeoutException traceback shown
# after this snippet.
button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".btn-search")))
print(search_inputs, button)
技术分享图片
Traceback (most recent call last):
  File "C:/Users/Administrator/Desktop/正则表达式/正则表达式.py", line 10, in <module>
    button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,btn-search)))
  File "C:\Users\Administrator\Desktop\正则表达式\venv\lib\site-packages\selenium\webdriver\support\wait.py", line 80, in until
    raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message: 
总是超时,原因很可能是:报错那一行的 CSS 选择器写成了 btn-search,缺少类选择器前缀“.”(应为 .btn-search),因此匹配不到任何元素,等待 10 秒后抛出 TimeoutException。

·title_is 标题是某内容
·title_contains 标题包含某内容
·presence_of_element_located 元素加载出,传入定位元组,如(By.ID, "p")
·visibility_of_element_located 元素可见,传入定位元组
·visibility_of 元素可见,传入元素对象
·presence_of_all_elements_located 所有元素加载出
·text_to_be_present_in_element 某个元素文本包含某文字
·text_to_be_present_in_element_value 某个元素值包含某文字
·frame_to_be_available_and_switch_to_it frame加载并切换
·invisibility_of_element_located 元素不可见
·element_to_be_clickable 元素可点击
·staleness_of 判断一个元素是否仍在DOM,可判断页面是否已经刷新
·element_to_be_selected 元素可选择,传元素对象
·element_located_to_be_selected 元素可选择,传入定位元组
·element_selection_state_to_be 传入元素对象以及状态,相等返回True,否则返回False
·element_located_selection_state_to_be 传入定位元组以及状态,相等返回True,否则返回False
·alert_is_present 是否出现Alert

前进后退

from selenium import webdriver
import time

# History navigation: visit three pages in order, go back one entry, pause,
# go forward again, then close the window.
browser = webdriver.Chrome()
pages = (
    "https://www.taobao.com/",
    "https://www.baidu.com/",
    "https://www.jingdong.com/",
)
for page in pages:
    browser.get(page)
browser.back()
time.sleep(1)  # brief pause between going back and going forward
browser.forward()
browser.close()

Cookies

from selenium import webdriver

# Cookies: print them, add one, print again, delete all, print once more.
browser = webdriver.Chrome()
browser.get("http://www.zhihu.com/explore")
print(browser.get_cookies())
# Domain corrected from the mistyped "www,zhihu.com" (comma) to "www.zhihu.com".
# NOTE(review): the sample output recorded after this snippet does not list
# the added cookie. Presumably it was captured with the mistyped domain;
# confirm by re-running.
browser.add_cookie({"name": "name", "domain": "www.zhihu.com", "value": "germey"})
print(browser.get_cookies())
browser.delete_all_cookies()
print(browser.get_cookies())  # the sample output shows an empty list here
技术分享图片
[{path: /, httpOnly: False, name: l_n_c, domain: .zhihu.com, secure: False, value: 1}, {path: /, httpOnly: False, name: tgw_l7_route, domain: www.zhihu.com, expiry: 1537604591.643548, secure: False, value: 156dfd931a77f9586c0da07030f2df36}, {path: /, httpOnly: False, name: d_c0, domain: .zhihu.com, expiry: 1632211696.283527, secure: False, value: "AFCk5k4oQA6PTrVJvEIdM1iDREt1Ez3H0lw=|1537603702"}, {path: /, httpOnly: False, name: _xsrf, domain: .zhihu.com, expiry: 1615363691.643627, secure: False, value: ZnQElKxeWBcoeNFASCTcgdhk56NJ83hf}, {path: /, httpOnly: False, name: __utmb, domain: .zhihu.com, expiry: 1537605497, secure: False, value: 51854390.0.10.1537603697}, {path: /, httpOnly: False, name: q_c1, domain: .zhihu.com, expiry: 1632211692.325099, secure: False, value: 104b20902a9f4159b0c1811e7dd3959c|1537603698000|1537603698000}, {path: /, httpOnly: False, name: r_cap_id, domain: .zhihu.com, expiry: 1540195692.325146, secure: False, value: "MWIwYjU3YmI1OWVkNGEwYmJhZGM0MTY5ZDQzZWU3MmQ=|1537603698|85c9986946afde9f1823ce067dc29aa2ea19d5f3"}, {path: /, httpOnly: False, name: cap_id, domain: .zhihu.com, expiry: 1540195692.325191, secure: False, value: "OTVhZmIwZDkzYzliNDAzNWI4ZTJiNWM2NzY0NWFmMjQ=|1537603698|67c5e766f10d39421a3d3afc84d45dfed316ae18"}, {path: /, httpOnly: False, name: l_cap_id, domain: .zhihu.com, expiry: 1540195692.325236, secure: False, value: "MDUzODliMmYwN2VlNDU1YjkwNGU0MjEwZDU0OTdkMjI=|1537603698|e44d90334d2319d2934c4b5cccb4d8d5a549247d"}, {path: /, httpOnly: False, name: n_c, domain: .zhihu.com, secure: False, value: 1}, {path: /, httpOnly: False, name: _zap, domain: .zhihu.com, expiry: 1600675697, secure: False, value: c7dbed92-9690-47b9-886e-d5539b1f74b8}, {path: /, httpOnly: False, name: __utma, domain: .zhihu.com, expiry: 1600675697, secure: False, value: 51854390.1486460487.1537603697.1537603697.1537603697.1}, {path: /, httpOnly: False, name: __utmc, domain: .zhihu.com, secure: False, value: 51854390}, {path: /, httpOnly: False, name: 
__utmz, domain: .zhihu.com, expiry: 1553371697, secure: False, value: 51854390.1537603697.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)}, {path: /, httpOnly: False, name: __utmv, domain: .zhihu.com, expiry: 1600675697, secure: False, value: 51854390.000--|3=entry_date=20180922=1}]
[{path: /, httpOnly: False, name: l_n_c, domain: .zhihu.com, secure: False, value: 1}, {path: /, httpOnly: False, name: tgw_l7_route, domain: www.zhihu.com, expiry: 1537604591.643548, secure: False, value: 156dfd931a77f9586c0da07030f2df36}, {path: /, httpOnly: False, name: d_c0, domain: .zhihu.com, expiry: 1632211696.283527, secure: False, value: "AFCk5k4oQA6PTrVJvEIdM1iDREt1Ez3H0lw=|1537603702"}, {path: /, httpOnly: False, name: _xsrf, domain: .zhihu.com, expiry: 1615363691.643627, secure: False, value: ZnQElKxeWBcoeNFASCTcgdhk56NJ83hf}, {path: /, httpOnly: False, name: __utmb, domain: .zhihu.com, expiry: 1537605497, secure: False, value: 51854390.0.10.1537603697}, {path: /, httpOnly: False, name: q_c1, domain: .zhihu.com, expiry: 1632211692.325099, secure: False, value: 104b20902a9f4159b0c1811e7dd3959c|1537603698000|1537603698000}, {path: /, httpOnly: False, name: r_cap_id, domain: .zhihu.com, expiry: 1540195692.325146, secure: False, value: "MWIwYjU3YmI1OWVkNGEwYmJhZGM0MTY5ZDQzZWU3MmQ=|1537603698|85c9986946afde9f1823ce067dc29aa2ea19d5f3"}, {path: /, httpOnly: False, name: cap_id, domain: .zhihu.com, expiry: 1540195692.325191, secure: False, value: "OTVhZmIwZDkzYzliNDAzNWI4ZTJiNWM2NzY0NWFmMjQ=|1537603698|67c5e766f10d39421a3d3afc84d45dfed316ae18"}, {path: /, httpOnly: False, name: l_cap_id, domain: .zhihu.com, expiry: 1540195692.325236, secure: False, value: "MDUzODliMmYwN2VlNDU1YjkwNGU0MjEwZDU0OTdkMjI=|1537603698|e44d90334d2319d2934c4b5cccb4d8d5a549247d"}, {path: /, httpOnly: False, name: n_c, domain: .zhihu.com, secure: False, value: 1}, {path: /, httpOnly: False, name: _zap, domain: .zhihu.com, expiry: 1600675697, secure: False, value: c7dbed92-9690-47b9-886e-d5539b1f74b8}, {path: /, httpOnly: False, name: __utma, domain: .zhihu.com, expiry: 1600675697, secure: False, value: 51854390.1486460487.1537603697.1537603697.1537603697.1}, {path: /, httpOnly: False, name: __utmc, domain: .zhihu.com, secure: False, value: 51854390}, {path: /, httpOnly: False, name: 
__utmz, domain: .zhihu.com, expiry: 1553371697, secure: False, value: 51854390.1537603697.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)}, {path: /, httpOnly: False, name: __utmv, domain: .zhihu.com, expiry: 1600675697, secure: False, value: 51854390.000--|3=entry_date=20180922=1}]
[]
打印后的结果为:

选项卡管理

from selenium import webdriver
import time

# Tab management: open a second tab with JavaScript and switch between tabs
# using their window handles.
browser = webdriver.Chrome()
browser.get("http://www.baidu.com")
browser.execute_script("window.open()")  # open a new tab
print(browser.window_handles)  # handles of all open windows/tabs
# switch_to.window() replaces the deprecated switch_to_window() and matches
# the switch_to.frame() style used elsewhere in this file.
browser.switch_to.window(browser.window_handles[1])  # switch to the second tab
browser.get("https://www.taobao.com")
time.sleep(1)
browser.switch_to.window(browser.window_handles[0])  # switch back to the first tab
browser.get("https://www.taobao.com")
browser.close()
技术分享图片
[CDwindow-65E31D9BF9FDC0B83D2821ABB85DB273, CDwindow-C2A8A67F87828D4AEA0A9D391203121E]
打印的结果为:

异常处理

from selenium import webdriver
from selenium.common.exceptions import TimeoutException,NoSuchElementException

# Exception handling: catch Selenium's TimeoutException around a page load
# and NoSuchElementException around an element lookup.
browser = webdriver.Chrome()
browser.get("http://www.baidu.com")

try:
    browser.get("https://www.baidu.com")
except TimeoutException:
    # Reached only if the page load raises TimeoutException.
    print("TIME OUT")
try:
    # The sample output shows this lookup fails ("NO ELEMENT" is printed).
    browser.find_element_by_id("name")
except NoSuchElementException:
    print("NO ELEMENT")
finally:
    # Close the browser window whether or not the lookup succeeded.
    browser.close()
技术分享图片
NO ELEMENT
打印后的结果为:

 

















以上是关于爬虫--selenium的主要内容,如果未能解决你的问题,请参考以下文章

Python爬虫编程思想(99):使用Selenium执行JavaScript代码

scrapy按顺序启动多个爬虫代码片段(python3)

爬虫请求库——selenium

爬虫----selenium模块

scrapy主动退出爬虫的代码片段(python3)

3爬虫之selenium模块