使用 Selenium 爬取 BOSS 直聘职位信息
Posted kend
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了使用 Selenium 爬取 BOSS 直聘职位信息的相关知识,希望对你有一定的参考价值。
"""Scrape Python job postings from zhipin.com (BOSS Zhipin) with Selenium.

The list page is rendered in Chrome, parsed with lxml, and every job's
detail page is opened in a temporary tab to collect its description.
"""
import time

from lxml import etree
from selenium import webdriver


class LagouSpider(object):
    """Crawler for the zhipin.com job list and the linked detail pages.

    Attributes:
        driver_path: Filesystem path of the chromedriver executable.
        driver: The Selenium Chrome driver used for every page load.
        url: The job-list page the crawl starts from.
        positions: One dict per scraped job (title, salary, company,
            desc_text), in page order.
        position_dict: The job currently being scraped (the last one once
            the crawl is finished).
        detail_url_list: Absolute URLs of every detail page found.
    """

    driver_path = r"G:\Crawler and Data\chromedriver.exe"

    def __init__(self):
        self.driver = self._build_driver()
        self.url = "https://www.zhipin.com/job_detail/?query=python&city=101010100&industry=&position="
        self.positions = []
        # These two must be distinct objects: the original chained assignment
        # bound both names to one list, which broke the dict-style writes.
        self.position_dict = {}
        self.detail_url_list = []

    def _build_driver(self):
        """Create the Chrome driver on both old and new Selenium releases."""
        try:
            # Selenium 4 takes the driver path through a Service object;
            # the ``executable_path`` keyword was removed in 4.10.
            from selenium.webdriver.chrome.service import Service
            return webdriver.Chrome(service=Service(self.driver_path))
        except (ImportError, TypeError):
            # Selenium 3 does not know the ``service`` keyword.
            return webdriver.Chrome(executable_path=self.driver_path)

    @staticmethod
    def _first(node, path):
        """Return the first XPath match under *node*, or "" when none."""
        matches = node.xpath(path)
        return matches[0] if matches else ""

    def run(self):
        """Load the list page, scrape it, and always shut the browser down."""
        try:
            self.driver.get(self.url)
            # page_source holds the rendered HTML, including each job's link.
            self.parse_list_page(self.driver.page_source)
        finally:
            self.driver.quit()

    def parse_list_page(self, source):
        """Extract every job on the list page and visit its detail page.

        Args:
            source: HTML text of the job-list page.
        """
        tree = etree.HTML(source)
        if tree is None:
            return

        li_list = tree.xpath("//div[@class='job-box']/div[@class='job-list']/ul/li")
        for li in li_list:
            href = self._first(li, './/div[@class="info-primary"]/h3/a/@href')
            if not href:
                # Not a job entry (e.g. an ad or placeholder item).
                continue

            detail_url = "https://www.zhipin.com" + href
            print(detail_url)
            self.detail_url_list.append(detail_url)

            # A fresh dict per job so that earlier results are not overwritten.
            self.position_dict = {
                "title": self._first(li, './/div[@class="info-primary"]/h3/a/div[@class="job-title"]/text()'),
                "salary": self._first(li, './/div[@class="info-primary"]/h3/a/span[@class="red"]/text()'),
                "company": self._first(li, './/div[@class="info-company"]//h3/a/text()'),
            }
            self.detail_page(detail_url)
            self.positions.append(self.position_dict)

    def detail_page(self, url):
        """Open *url* in a temporary tab and record the job description.

        The description is stored under ``desc_text`` in
        ``self.position_dict`` ("" when the expected node is missing).

        Args:
            url: Absolute URL of one job detail page.

        Raises:
            RuntimeError: If the browser did not open a new tab.
        """
        list_handle = self.driver.current_window_handle
        known_handles = set(self.driver.window_handles)

        # Open a blank tab and navigate with get(), which blocks until the
        # page has loaded; window.open(url) returns before any content exists.
        self.driver.execute_script("window.open('about:blank');")
        new_handles = [h for h in self.driver.window_handles if h not in known_handles]
        if not new_handles:
            raise RuntimeError("browser did not open a new tab for %s" % url)

        self.driver.switch_to.window(new_handles[0])
        try:
            self.driver.get(url)
            tree = etree.HTML(self.driver.page_source)
            desc = []
            if tree is not None:
                desc = tree.xpath("//div[@id='main']/div[3]/div/div[2]/div[2]/div[1]/div")
            # string() flattens the node and all nested tags into plain text.
            desc_text = desc[0].xpath('string()').strip() if desc else ""
            self.position_dict['desc_text'] = desc_text
            print(self.position_dict)
            time.sleep(2)  # pause between detail pages to limit request rate
        finally:
            # Close the detail tab and return to the list tab even on failure.
            self.driver.close()
            self.driver.switch_to.window(list_handle)


if __name__ == '__main__':
    spider = LagouSpider()
    spider.run()
以上是关于使用 Selenium 爬取 BOSS 直聘职位信息的主要内容,如果未能解决你的问题,请参考以下文章: