Python Web Scraping: Hands-on Parsing with lxml
Posted by 知我几分
Common XPath rules:

- `/` — select direct children of the current node
- `//` — select descendants of the current node
- `.` — select the current node
- `..` — select the parent of the current node
- `@` — select attributes
- `*` — wildcard; matches any element node regardless of name
- `@*` — select all attributes
- `[@attrib]` — select all elements that carry the given attribute
- `[@attrib='value']` — select all elements whose given attribute has the given value
- `[tag]` — select all elements that have the specified element as a direct child
- `[tag='text']` — select all elements whose specified child element has the text content "text"
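Here is a minimal, self-contained sketch exercising these rules with lxml; the HTML snippet, element names, and attributes are made up purely for illustration:

```python
from lxml import etree

# A made-up snippet just to exercise the rules above
doc = etree.HTML("""
<ul class="lists">
  <li data-title="Movie A"><a href="/a">A</a></li>
  <li data-title="Movie B"><a href="/b">B</a></li>
</ul>
""")

ul = doc.xpath("//ul[@class='lists']")[0]    # // plus [@attrib='value']
print(ul.xpath("./li"))                      # .  direct <li> children
print(ul.xpath("..")[0].tag)                 # .. parent node -> 'body'
print(doc.xpath("//li/@data-title"))         # @  attribute values
print(len(doc.xpath("//li[@data-title]")))   # [@attrib] -> 2
print(doc.xpath("//li[a='A']/@data-title"))  # [tag='text'] -> ['Movie A']
```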
"""爬取豆瓣网站的信息""" import requests from lxml import etree # 请求头设置 headers = { "User-Agentv": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36", "Referer": "https://movie.douban.com/", } url = "https://movie.douban.com/cinema/nowplaying/chongqing/" # 发起请求 rep = requests.get(url, headers=headers) text = rep.text # 转换成html格式 html = etree.HTML(text) # 找到子孙节点ul标签 ul = html.xpath("//ul[@class=\'lists\']")[0] # 当前ul下的所有li标签 lis = ul.xpath("./li") movies = [] # 循环每个li标签 for li in lis: # 直接@li标签的属性获取值 title = li.xpath("@data-title")[0] score = li.xpath("@data-score")[0] region = li.xpath("@data-region")[0] actors = li.xpath("@data-actors")[0] director = li.xpath("@data-director")[0] liimg = li.xpath(".//img/@src") movie = { "title": title, "score": score, "region": region, "actors": actors, "director": director, "liimg": liimg, } movies.append(movie) print(movies)
电影天堂 (Movie Heaven)
```python
import requests
from lxml import etree

BASE_DOMAIN = "http://www.ygdy8.net"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36",
}


def get_detail_urls(url):
    # Fetch a list page
    rep = requests.get(url=url, headers=HEADERS)
    # Gotcha: the page contains bytes that are invalid GBK, so pass "ignore" to skip them
    text = rep.content.decode("gbk", "ignore")
    html = etree.HTML(text)
    # The detail links all sit in <a> tags under tables with class "tbspan"
    detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
    # map takes a function and an iterable; the lambda prepends the domain to each relative URL
    detail_urls = map(lambda url: BASE_DOMAIN + url, detail_urls)
    # Return the joined detail-page URLs
    return detail_urls


def parse_detail_page(url):
    # Scrape one detail page
    movie = {}
    res = requests.get(url, headers=HEADERS)
    text = res.content.decode("gbk")
    html = etree.HTML(text)
    title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    movie["title"] = title
    zoomE = html.xpath("//div[@id='Zoom']")[0]
    # Grab the <img> sources under the current node
    imgs = zoomE.xpath(".//img/@src")
    # Slicing never raises IndexError, even when fewer images exist
    cover = imgs[0:1]
    movie["cover"] = cover
    poster = imgs[1:2]
    movie["poster"] = poster
    infos = zoomE.xpath(".//text()")

    def parse_info(info, rule):
        # Repeated operation, factored out into a helper
        return info.replace(rule, "").strip()

    for index, info in enumerate(infos):
        if info.startswith("◎年 代"):
            movie["year"] = parse_info(info, "◎年 代")
        elif info.startswith("◎产 地"):
            movie["country"] = parse_info(info, "◎产 地")
        elif info.startswith("◎类 别"):
            movie["category"] = parse_info(info, "◎类 别")
        elif info.startswith("◎豆瓣评分"):
            movie["douban_rating"] = parse_info(info, "◎豆瓣评分")
        elif info.startswith("◎片 长"):
            movie["duration"] = parse_info(info, "◎片 长")
        elif info.startswith("◎导 演"):
            movie["director"] = parse_info(info, "◎导 演")
        elif info.startswith("◎主 演"):
            text = parse_info(info, "◎主 演")
            actors = [text]
            # The remaining leads are on the following lines, up to the next "◎" field
            for x in range(index + 1, len(infos)):
                actor = infos[x].strip()
                if actor.startswith("◎标"):
                    break
                actors.append(actor)
            movie["actors"] = actors
        elif info.startswith("◎简 介"):
            # The synopsis spans the following lines, up to the awards field
            profile = ""
            for x in range(index + 1, len(infos)):
                line = infos[x].strip()
                if line.startswith("◎获奖情况"):
                    break
                profile += line
            movie["profile"] = profile
    download_url = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")
    movie["download_url"] = download_url
    return movie


def spider():
    base_url = "http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html"
    movies = []
    # Build the URL for each list page to crawl
    for i in range(1, 180):
        url = base_url.format(i)
        # Collect the detail-page links from the list page
        detail_urls = get_detail_urls(url)
        for detail_url in detail_urls:
            # Scrape each detail page and collect the data
            movie = parse_detail_page(detail_url)
            movies.append(movie)
    print(movies)


if __name__ == '__main__':
    spider()
```
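One detail worth knowing here: in Python 3, `map()` returns a lazy, one-shot iterator, exhausted after a single pass. The `spider()` loop only iterates once, so this works, but materializing a list makes reuse and debugging easier; a small sketch:

```python
# In Python 3, map() yields a one-shot iterator:
urls = map(lambda u: "http://www.ygdy8.net" + u, ["/a.html", "/b.html"])
print(list(urls))  # ['http://www.ygdy8.net/a.html', 'http://www.ygdy8.net/b.html']
print(list(urls))  # []  -- already consumed

# A list comprehension avoids the surprise:
urls = ["http://www.ygdy8.net" + u for u in ["/a.html", "/b.html"]]
```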
猫眼电影 (Maoyan Movies)
"""猫眼电影爬取""" import requests from lxml import etree BASE_URL = "http://maoyan.com" HEADERS = { "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36" } def get_detail_urls(url): # 具体获取详情url rep = requests.get(url=url, headers=HEADERS) html = etree.HTML(rep.text) # 找到详情url detail_urls = html.xpath("//dl//div[@class=\'movie-item\']/a/@href") detail_urls = map(lambda url: BASE_URL+url, detail_urls) return detail_urls def parse_detail_page(url): # 获取数据 movie = {} res = requests.get(url=url, headers=HEADERS) text = res.content.decode("utf-8") html = etree.HTML(text) name = html.xpath("//div[@class=\'movie-brief-container\']/h3/text()")[0] movie["name"] = name lis = html.xpath("//div[@class=\'movie-brief-container\']//li") for li in range(len(lis)): if li == 0: movie["plot"] = lis[li].xpath("./text()")[0] if li == 1: movie["country"] = lis[li].xpath("./text()")[0].split()[0] movie["duration"] = lis[li].xpath("./text()")[0].split()[1] if li == 2: try: movie["release_time"] = lis[li].xpath("./text()")[0] except Exception as e: continue avatar = html.xpath("//div[@class=\'avatar-shadow\']/img/@src") movie["avatar"] = avatar content = html.xpath("//div[@class=\'mod-content\']/span/text()")[0] movie["content"] = content container = html.xpath("//div[@class=\'comment-list-container\']/ul") for li in container: li_name = li.xpath(".//span[@class=\'name\']/text()") li_content = li.xpath(".//div[@class=\'comment-content\']/text()") livs = zip(li_name, li_content) movie["user"] = dict((name, value)for name, value in livs) return movie def spider(): # 获取url自行拼接 base_url = "http://maoyan.com/films?showType=1&offset={}" movies = [] for i in range(0, 31, 30): url = base_url.format(i) # 拿到url之后去找到详情页面url detail_urls = get_detail_urls(url) for detail_url in detail_urls: # 去获取详情页面数据 movie = parse_detail_page(detail_url) movies.append(movie) print(movie) print(movies) if __name__ == \'__main__\': spider()
腾讯招聘网 (Tencent Recruitment)
"""爬取腾讯招聘网找工作""" import requests from lxml import etree HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36", "Referer": "https://hr.tencent.com/position.php?keywords=python&lid=2218&tid=87&start=0" } BASE_URL = "https://hr.tencent.com/" def get_detail_urls(url): rep = requests.get(url=url, headers=HEADERS) html = etree.HTML(rep.text) detail_urls = html.xpath("//table//td[@class=\'l square\']/a/@href") detail_urls = map(lambda url: BASE_URL+url, detail_urls) return detail_urls def get_parse_detail(url): job_offers = {} res = requests.get(url=url, headers=HEADERS) html = etree.HTML(res.text) position = html.xpath("//table//td[@class=\'l2 bold size16\']/text()")[0] job_offers["position"] = position tds = html.xpath("//table//tr[@class=\'c bottomline\']/td/text()") for i in range(len(tds)): job_offers["location"] = tds[0] job_offers["category"] = tds[1] job_offers["recruits"] = tds[2] duties = html.xpath("//tr[3][contains(@class, \'c\')]//li/text()") job_offers["duties"] = duties claim = html.xpath("//tr[4][contains(@class, \'c\')]//li/text()") job_offers["claim"] = claim return job_offers def spider(): base_url = "https://hr.tencent.com/position.php?keywords=python&lid=2218&tid=87&start={}#a" squres = [] for i in range(0, 340, 10): url = base_url.format(i) detail_urls = get_detail_urls(url) for detail_url in detail_urls: squre = get_parse_detail(detail_url) squres.append(squre) print(squre) if __name__ == \'__main__\': spider()
For further reading (I'll spare myself writing it all up), see this blog post: http://www.cnblogs.com/zhangxinqi/p/9210211.html#_label11