Hands-On Scraping Project: Font Anti-Scraping
1. Digit anti-scraping
Why the numbers appear garbled: the page's stylesheet loads a custom font file, and that font file encodes the character remapping. The general approach:

1. Request the page.
2. Extract the obfuscated font file (embedded in the CSS as base64).
3. Parse the font file and work out the mapping between glyphs and real characters.
4. Take the obfuscated characters from the page and translate them through that mapping, one by one.

A minimal sketch of steps 2-4 follows; the full script against the glidedsky crawler-font-puzzle exercise comes after it.
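Before the full script, here is a minimal sketch of steps 2-4 (the name build_digit_mapping and the in-memory handling are illustrative, not from the original post). It assumes the base64 font payload has already been extracted from the CSS and decoded into font_bytes, and it mirrors the glyph-order mapping used in the full example below.

import io
from fontTools.ttLib import TTFont

# Glyph names and the digits those names denote
NAME_TO_DIGIT = {'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4,
                 'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9}

def build_digit_mapping(font_bytes):
    """Build {digit as written in the HTML source: digit the font actually shows}."""
    font = TTFont(io.BytesIO(font_bytes))
    mapping = {}
    for glyph_name in font.getGlyphOrder():
        if glyph_name in NAME_TO_DIGIT:
            # The glyph's position in the glyph order (minus 1 for .notdef)
            # is treated as the digit that is really displayed
            mapping[NAME_TO_DIGIT[glyph_name]] = font.getGlyphID(glyph_name) - 1
    return mapping

# Usage: translate one scraped number string digit by digit
# mapping = build_digit_mapping(font_bytes)
# real_number = int(''.join(str(mapping[int(ch)]) for ch in scraped_text))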
import re
import base64
import asyncio
import aiohttp
import time
from fontTools.ttLib import TTFont
from lxml import etree

headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Mobile Safari/537.36",
    "Cookie": "footprints=eyJpdiI6IkhNbHRoM0lTdWdYWnJIcW9PZ1E0dkE9PSIsInZhbHVlIjoiWG5SZDVwWkpsekxNdERIRjZQeHgyY1JzWVFxWGpVMUFYMjV2NlNHVFBBaVRcL3F5akNYYU5nc0RNM2VzTUN2YWYiLCJtYWMiOiI2ODEyNTg5NjYxMDBkNzZjNzMwMWE0ZTkwM2FlZWU3MzVlNTNmMzE2ZGUzNzRiMzM2NmNlODg2NmNhOWMzOGRmIn0%3D; _ga=GA1.2.907203747.1604565923; _gid=GA1.2.1910542827.1604565923; __gads=ID=e84726596f3c4e4d-2245fc118dc4006a:T=1604565923:RT=1604565923:S=ALNI_MbZHQMLD_QOArjXJ8cXuhVtK_C-Zw; Hm_lvt_020fbaad6104bcddd1db12d6b78812f6=1604565922,1604567153; _gat_gtag_UA_75859356_3=1; XSRF-TOKEN=eyJpdiI6IlRBSTlUZDNNM0kyRE40ZlF2T2xjQWc9PSIsInZhbHVlIjoiK0dCRTBTRUN1dWNJcjZPclNHemwrYnhKQUlHRWhyWUJpZjJQb0JDZEliZXNpeFwvYjM1Y2VTdXl5c2xaaHlmWmkiLCJtYWMiOiIzMWIwNGQ2MWYyNDUwNWIwYzFmY2RjZmQ0NGYyOGNkYmRmMDBhMzg4YWVlNGRiOWE2MWNkMDZkZmEzNzg2NTk3In0%3D; glidedsky_session=eyJpdiI6InRuMm44ZlwvemRxdmJ1dEJpVXdpbSt3PT0iLCJ2YWx1ZSI6ImowQnVmeUx5NGRZMUxIcERJSHRSMW84bExib09rRzNhbDZFMERwYXYrRW9cL3JlTVljK0c5M05CSzJGN21YandZIiwibWFjIjoiOWUwZGRiZGI1OTBlMjZlMDY1MTAzNWVmOTI5Yjg5NWFhYmFmMTdjODdlYTg4ZDc2Nzg0ZWRiNjc1MDc3MWNkNyJ9; Hm_lpvt_020fbaad6104bcddd1db12d6b78812f6=16045680130,"
}

# Glyph names and the digits those names denote (.notdef is the placeholder glyph)
number_map = {
    ".notdef": -1, "zero": 0, "one": 1, "two": 2, "three": 3, "four": 4,
    "five": 5, "six": 6, "seven": 7, "eight": 8, "nine": 9
}

async def get_page_number(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url=url, headers=headers) as response:
            data = await response.read()
    # The numbers as they appear in the page source (still obfuscated by the font)
    number_list = etree.HTML(data).xpath('//div[@class="col-md-1"]/text()')
    number_list = [n.strip() for n in number_list]
    # The font is embedded in the page's CSS as a base64 string: "base64,....) format"
    font_base = re.findall(r'base64,(.*?)\) format', str(data, encoding='utf-8'))[0]
    result = base64.b64decode(font_base)
    # Save the decoded font file
    with open("字体文件.ttf", mode="wb") as f:
        f.write(result)
    # Open the font with TTFont and also dump it to XML for inspection
    font = TTFont('字体文件.ttf')
    font.saveXML("font.xml")
    font_map = font.getGlyphOrder()
    lists = []
    for name in font_map:
        # getGlyphID() returns the glyph ID for a given glyph name;
        # subtract 1 (for .notdef) and collect the result
        lists.append(font.getGlyphID(name) - 1)
    dicts = dict(zip(font_map, lists))  # glyph name -> glyph-order position - 1
    # Re-key the mapping by the digit that appears in the HTML source;
    # iterate over a copy of the keys because the dict is modified in the loop
    for key in list(dicts.keys()):
        dicts[str(number_map[key])] = str(dicts.pop(key))
    res = 0
    for n in number_list:
        # Translate every character of the scraped number, then add the page's numbers up
        res += int(''.join(dicts[ch] for ch in n))
    print(res)
    return res

if __name__ == "__main__":
    url = "http://glidedsky.com/level/web/crawler-font-puzzle-1?page={}"
    res = 0
    start_time = time.time()
    loop = asyncio.get_event_loop()
    # The pages are awaited one at a time: every call writes the same 字体文件.ttf,
    # so running the coroutines concurrently would let them overwrite each other's font
    for i in range(1, 1001):
        future = asyncio.ensure_future(get_page_number(url.format(i)))
        loop.run_until_complete(future)
        res += future.result()
    end_time = time.time()
    print(res)
    print('Elapsed time: {}s'.format(int(end_time - start_time)))
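The loop above fetches the pages sequentially because every call to get_page_number writes the same 字体文件.ttf, so truly concurrent coroutines would overwrite each other's font. A hypothetical concurrent variant is sketched below, assuming get_page_number is first changed to parse the font in memory (TTFont accepts a file-like object such as io.BytesIO(result)) instead of using a fixed file on disk:

import asyncio

URL_TEMPLATE = "http://glidedsky.com/level/web/crawler-font-puzzle-1?page={}"

async def crawl_all():
    # get_page_number is the coroutine defined above, assumed to no longer
    # touch a shared font file on disk
    tasks = [get_page_number(URL_TEMPLATE.format(i)) for i in range(1, 1001)]
    results = await asyncio.gather(*tasks)
    return sum(results)

# total = asyncio.run(crawl_all())
# print(total)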
A second example decrypts the obfuscated numbers on a Qidian (起点中文网) book page, where the digits appear in the HTML as &#NNNNN; entities and are remapped through a custom woff font:

import requests
import re
from fontTools.ttLib import TTFont

headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}
url = "https://book.qidian.com/info/1018027842"
response = requests.get(url=url, headers=headers)
response.encoding = 'utf-8'
html_data = response.text

# The page's CSS embeds the font like:
#   ; src: url('https://qidian.gtimg.com/qd_anti_spider/IiizhcHC.eot?') format(...)
# Match the font download URLs; index 1 picks the .woff one
font_url = re.findall(r"; src: url\('(.*?)'\) format", response.text)[1]
font_res = requests.get(url=font_url, headers=headers)
with open("woff文件.woff", mode="wb") as fp:
    fp.write(font_res.content)

font = TTFont('woff文件.woff')
font.saveXML("font_woff.xml")

# Get the font's character-to-glyph mapping: {codepoint: glyph name}
font_cmap = font['cmap'].getBestCmap()
f = {'period': '.', 'four': 4, 'three': 3, 'six': 6, 'zero': 0, 'one': 1,
     'eight': 8, 'seven': 7, 'nine': 9, 'five': 5, 'two': 2}
# Turn the mapping into {codepoint: real character}
for key in font_cmap:
    font_cmap[key] = f[font_cmap[key]]
# Replace every &#NNNNN; entity in the HTML with the real character
for key in font_cmap:
    html_data = html_data.replace('&#' + str(key) + ';', str(font_cmap[key]))

with open("反扒成功.html", "w", encoding="utf-8") as fp:
    fp.write(html_data)
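For reuse across pages, the cmap-based replacement can be packaged as a small helper. A minimal sketch, assuming the standard glyph names used above (decode_obfuscated_html and GLYPH_TO_CHAR are illustrative names, not from the original post):

from fontTools.ttLib import TTFont

# Glyph names for the digits and the decimal point, as seen in the Qidian font
GLYPH_TO_CHAR = {
    'period': '.', 'zero': '0', 'one': '1', 'two': '2', 'three': '3',
    'four': '4', 'five': '5', 'six': '6', 'seven': '7', 'eight': '8', 'nine': '9',
}

def decode_obfuscated_html(html_text, font_path):
    """Replace &#NNNNN; entities in html_text with the characters the font renders."""
    cmap = TTFont(font_path)['cmap'].getBestCmap()  # {codepoint: glyph name}
    for codepoint, glyph_name in cmap.items():
        if glyph_name in GLYPH_TO_CHAR:
            html_text = html_text.replace('&#{};'.format(codepoint), GLYPH_TO_CHAR[glyph_name])
    return html_text

# Usage:
# decoded = decode_obfuscated_html(html_data, 'woff文件.woff')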