Crawler Practice Project: Font Anti-Scraping

Posted 854594834-yt


1. Number Anti-Scraping

Why the digits come out garbled: the page's style loads a custom font file, and that font file carries the scrambled character-to-glyph mapping that "encrypts" the numbers.
Rough approach:
1. Request the page.
2. Extract the obfuscated font file embedded in it.
3. Parse the font file and recover the mapping between glyphs and real digits (see the short sketch right after this list).
4. Run each obfuscated character through that mapping, one by one.
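To make step 3 concrete, here is a minimal sketch of inspecting a font with fontTools. It assumes the obfuscated font has already been downloaded and saved locally as font.ttf (a placeholder name); the full script below does the download itself.

from fontTools.ttLib import TTFont

# Assumes the obfuscated font was already saved locally as "font.ttf" (placeholder name).
font = TTFont("font.ttf")
font.saveXML("font.xml")              # dump the tables as XML so they can be read by eye

print(font.getGlyphOrder())           # glyph names in glyph-ID order, e.g. ['.notdef', 'three', ...]
print(font["cmap"].getBestCmap())     # {codepoint: glyph name} mapping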
import requests
import re
import base64
from fontTools.ttLib import TTFont
from lxml import etree
import asyncio
import aiohttp
import time
headers = {
    "User-Agent": "Mozilla/5.0 (Linux; android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (Khtml, like Gecko) Chrome/86.0.4240.111 Mobile Safari/537.36",
    "Cookie": "footprints=eyJpdiI6IkhNbHRoM0lTdWdYWnJIcW9PZ1E0dkE9PSIsInZhbHVlIjoiWG5SZDVwWkpsekxNdERIRjZQeHgyY1JzWVFxWGpVMUFYMjV2NlNHVFBBaVRcL3F5akNYYU5nc0RNM2VzTUN2YWYiLCJtYWMiOiI2ODEyNTg5NjYxMDBkNzZjNzMwMWE0ZTkwM2FlZWU3MzVlNTNmMzE2ZGUzNzRiMzM2NmNlODg2NmNhOWMzOGRmIn0%3D; _ga=GA1.2.907203747.1604565923; _gid=GA1.2.1910542827.1604565923; __gads=ID=e84726596f3c4e4d-2245fc118dc4006a:T=1604565923:RT=1604565923:S=ALNI_MbZHQMLD_QOArjXJ8cXuhVtK_C-Zw; Hm_lvt_020fbaad6104bcddd1db12d6b78812f6=1604565922,1604567153; _gat_gtag_UA_75859356_3=1; XSRF-TOKEN=eyJpdiI6IlRBSTlUZDNNM0kyRE40ZlF2T2xjQWc9PSIsInZhbHVlIjoiK0dCRTBTRUN1dWNJcjZPclNHemwrYnhKQUlHRWhyWUJpZjJQb0JDZEliZXNpeFwvYjM1Y2VTdXl5c2xaaHlmWmkiLCJtYWMiOiIzMWIwNGQ2MWYyNDUwNWIwYzFmY2RjZmQ0NGYyOGNkYmRmMDBhMzg4YWVlNGRiOWE2MWNkMDZkZmEzNzg2NTk3In0%3D; glidedsky_session=eyJpdiI6InRuMm44ZlwvemRxdmJ1dEJpVXdpbSt3PT0iLCJ2YWx1ZSI6ImowQnVmeUx5NGRZMUxIcERJSHRSMW84bExib09rRzNhbDZFMERwYXYrRW9cL3JlTVljK0c5M05CSzJGN21YandZIiwibWFjIjoiOWUwZGRiZGI1OTBlMjZlMDY1MTAzNWVmOTI5Yjg5NWFhYmFmMTdjODdlYTg4ZDc2Nzg0ZWRiNjc1MDc3MWNkNyJ9; Hm_lpvt_020fbaad6104bcddd1db12d6b78812f6=16045680130,"
}

number_map = {
        ".notdef": -1,
        "zero": 0,
        "one": 1,
        "two": 2,
        "three": 3,
        "four": 4,
        "five": 5,
        "six": 6,
        "seven": 7,
        "eight": 8,
        "nine": 9
    }

async def get_page_number(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url=url, headers=headers) as response:
            data = await response.read()
            # the obfuscated digit strings exactly as they appear in the page text
            number_list = etree.HTML(data).xpath('//div[@class="col-md-1"]/text()')
            number_list = [n.strip() for n in number_list]

            # the font is embedded in the page as a base64 data URI inside an @font-face rule
            font_base = re.findall(r'base64,(.*?)\) format', str(data, encoding="utf-8"))[0]
            result = base64.b64decode(font_base)
            # decode the payload and save it as a font file
            with open("字体文件.ttf", mode="wb") as f:
                f.write(result)

            # open the font with TTFont and dump it as XML for manual inspection
            font = TTFont("字体文件.ttf")
            font.saveXML("font.xml")

            # glyph names in glyph-ID order, e.g. ['.notdef', 'three', 'seven', ...]
            font_map = font.getGlyphOrder()
            lists = []
            for name in font_map:
                # getGlyphID(name) returns the glyph's index in the glyph order;
                # subtract one to step past the leading ".notdef" glyph
                lists.append(font.getGlyphID(name) - 1)
            dicts = dict(zip(font_map, lists))

            # rebuild the mapping: digit written in the HTML source -> real digit
            # (iterate over a copy of the keys because the dict is modified inside the loop)
            for key in list(dicts.keys()):
                dicts[str(number_map[key])] = str(dicts.pop(key))

            res = 0
            for n in number_list:
                # translate every character of the obfuscated number, then add it up
                res += int("".join(dicts[ch] for ch in n))
            print(res)
            return res
if __name__ == "__main__":
    url = "http://glidedsky.com/level/web/crawler-font-puzzle-1?page={}"
    start_time = time.time()
    loop = asyncio.get_event_loop()
    # schedule every page as a task so the 1000 requests run concurrently
    async_list = [asyncio.ensure_future(get_page_number(url.format(i)))
                  for i in range(1, 1001)]
    res = sum(loop.run_until_complete(asyncio.gather(*async_list)))
    end_time = time.time()
    print(res)
    print("Total time: {}s".format(int(end_time - start_time)))
Example 1
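To see why "glyph ID minus one" recovers the real digit, here is a small self-contained sketch; the glyph order below is made up for illustration, and the decoding logic mirrors what the script above builds.

# A made-up glyph order, in the shape font.getGlyphOrder() returns it:
glyph_order = [".notdef", "three", "seven", "zero", "one", "nine",
               "five", "two", "eight", "six", "four"]

number_map = {"zero": 0, "one": 1, "two": 2, "three": 3, "four": 4,
              "five": 5, "six": 6, "seven": 7, "eight": 8, "nine": 9}

# Digit written in the HTML source -> real digit (glyph position minus one),
# exactly as the script above constructs it.
decode = {str(number_map[name]): str(pos - 1)
          for pos, name in enumerate(glyph_order) if name != ".notdef"}

print(decode)        # {'3': '0', '7': '1', '0': '2', '1': '3', '9': '4', ...}
print(decode["3"])   # a '3' in the page source really means 0 with this font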
import requests
import re
from fontTools.ttLib import TTFont

headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

url = "https://book.qidian.com/info/1018027842"
response = requests.get(url=url, headers=headers)
response.encoding = "utf-8"

html_data = response.text

# The style block lists the font in several formats, e.g.:
# src: url('https://qidian.gtimg.com/qd_anti_spider/IiizhcHC.eot?') format(...), url('....woff') format('woff'), ...
# Match every url('...') format pair; the second match ([1]) is the .woff file
font_url = re.findall(r"url\('(.*?)'\) format", response.text)[1]

font_res = requests.get(url=font_url, headers=headers)
with open("woff文件.woff", mode="wb") as fp:
    fp.write(font_res.content)
font = TTFont("woff文件.woff")
font.saveXML("font_woff.xml")


# Get the font's cmap: {codepoint: glyph name}, e.g. {100234: 'period', ...}
font_cmap = font["cmap"].getBestCmap()
f = {
    "period": ".",
    "four": "4",
    "three": "3",
    "six": "6",
    "zero": "0",
    "one": "1",
    "eight": "8",
    "seven": "7",
    "nine": "9",
    "five": "5",
    "two": "2",
}
# Rewrite the cmap so each codepoint maps to the real character
for key in font_cmap:
    font_cmap[key] = f[font_cmap[key]]

# Replace the numeric character references (&#xxxxx;) in the HTML with the real characters
for key in font_cmap:
    html_data = html_data.replace("&#" + str(key) + ";", str(font_cmap[key]))
with open("反扒成功.html", "w", encoding="utf-8") as fp:
    fp.write(html_data)
Example 2
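In Example 2 the obfuscated digits sit in the HTML as decimal character references (&#xxxxx;), and the cmap keys are exactly those codepoints. Here is the replacement step in isolation as a small sketch; the codepoints and the HTML fragment are invented for illustration.

import re

# Invented stand-ins for what getBestCmap() and the page might contain:
font_cmap = {100234: "period", 100235: "four", 100236: "three", 100238: "six"}
name_to_char = {"period": ".", "zero": "0", "one": "1", "two": "2", "three": "3",
                "four": "4", "five": "5", "six": "6", "seven": "7",
                "eight": "8", "nine": "9"}

html_fragment = "&#100235;&#100236;&#100234;&#100238;万字"   # hypothetical snippet
for codepoint in re.findall(r"&#(\d+);", html_fragment):
    html_fragment = html_fragment.replace(
        "&#{};".format(codepoint), name_to_char[font_cmap[int(codepoint)]])
print(html_fragment)   # -> 43.6万字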

 
