Python-爬虫-针对有frame框架的页面
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Python-爬虫-针对有frame框架的页面相关的知识,希望对你有一定的参考价值。
有的页面会使用 frame/iframe 框架。用 Selenium + PhantomJS 打开这类页面后,driver.page_source 默认只包含外层页面,并不会包含 iframe 框架中的网页内容。iframe 框架相当于在页面中又加载了一个页面,需要先使用 Selenium 的 switch_to.frame() 方法切换到该框架,才能访问其中的内容(官网给的方法是 switch_to_frame(),但该写法已被弃用,IDE 会提醒使用 switch_to.frame() 替代)。
比如:
driver.switch_to.frame('g_iframe')
html = driver.page_source
然后结合BeautifulSoup获取网页中信息。
这次我们爬取 http://music.163.com/#/artist/album?id=101988&limit=120&offset=0 页面中的专辑信息,比如专辑的图片、图片网址及专辑名字。
""" http://music.163.com/#/artist/album?id=101988&limit=120&offset=0 爬取上述网址中的专辑信息 """ from selenium import webdriver from urllib.request import urlretrieve import os from bs4 import BeautifulSoup class DownloadInfo(): def __init__(self): self.url = ‘http://music.163.com/#/artist/album?id=101988&limit=120&offset=0‘ self.basePath = os.path.dirname(__file__) def makedir(self, name): path = os.path.join(self.basePath, name) isExist = os.path.exists(path) if not isExist: os.makedirs(path) print(‘The file is created now.‘) else: print(‘The file existed.‘) #切换到该目录下 os.chdir(path) return path def connect(self, url): driver = webdriver.PhantomJS() driver.get(url) print(‘success‘) return driver def getFileNames(self, path): pic_names = os.listdir(path) return pic_names def getInfo(self): driver = self.connect(self.url) driver.switch_to.frame(‘g_iframe‘) path = self.makedir(‘Infos‘) pic_names = self.getFileNames(path) imgs = driver.find_elements_by_xpath("//div[@class=‘u-cover u-cover-alb3‘]/img") titles = driver.find_elements_by_xpath("//li/p[@class=‘dec dec-1 f-thide2 f-pre‘]/a") dates = driver.find_elements_by_xpath("//span[@class=‘s-fc3‘]") count = 0 for img in imgs: album_name = titles[count].text count += 1 photo_name = album_name.replace(‘/‘, ‘‘) + ‘.jpg‘ print(photo_name) if photo_name in pic_names: print(‘图片已下载。‘) else: urlretrieve(img.get_attribute(‘src‘), photo_name) for title in titles: print(title.text) for date in dates: print(date.text) """ def getInfo(self): driver = self.connect(self.url) driver.switch_to.frame(‘g_iframe‘) html = driver.page_source path = self.makedir(‘Infos‘) pic_names = self.getFileNames(path) all_li = BeautifulSoup(html, ‘lxml‘).find(id=‘m-song-module‘).find_all(‘li‘) for li in all_li: album_img = li.find(‘img‘)[‘src‘] album_name = li.find(‘p‘, class_=‘dec‘)[‘title‘] album_date = li.find(‘span‘, class_=‘s-fc3‘).get_text() print(album_img) print(album_name) print(album_date) photo_name = album_name.replace(‘/‘, ‘‘) + ‘.jpg‘ if photo_name in 
pic_names: print(‘图片已下载。‘) else: urlretrieve(album_img, photo_name) """ if __name__ == ‘__main__‘: obj = DownloadInfo() obj.getInfo()
以上是关于Python-爬虫-针对有frame框架的页面的主要内容,如果未能解决你的问题,请参考以下文章