爬虫521错误(又是一次和可爱的前端vs的故事)
Posted yhll
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬虫521错误(又是一次和可爱的前端vs的故事)相关的知识,希望对你有一定的参考价值。
起因:
今天突然想重构一下代理池,并且想扩充一下代理,所以就想着爬点代理IP,然后就有了下面的故事
一上来先进行了一顿操作:
def get_xxdaili(url):
    """Fetch a www.66ip.cn page with browser-like request headers.

    This is the naive first attempt: no cookies are sent and the status
    code is not inspected here, so the caller must check
    ``res.status_code`` before trying to parse the body.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The ``requests.Response`` object for the GET request.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36',
        "Host": 'www.66ip.cn',
        "Referer": 'http://www.66ip.cn/index.html',
        "Upgrade-Insecure-Requests": '1',
    }
    res = requests.get(url=url, headers=headers)
    return res
然后看都没看状态码直接xpath取:过了一会黑人问号??????,喵喵喵,为啥是空,点开源代码,啥都有,哦,可能是xpath写的有问题,又进行了微调,还是取不到,突然感觉这个网站好骚,怎么就取不到呢.又重新分析了一次源代码与Network, 然后看了眼返回状态码,521,经过分析以后得出了问题的原因:
发生 521 错误是因为源服务器拒绝来自 Cloudflare 的连接。更具体地说,Cloudflare 尝试通过端口 80 或 443 连接到您的源服务器,但却收到连接被拒绝的错误。
我发现cookie的参数很有问题,所以估计是cookie的问题(之前没遇到521,所以一开始也不清楚哪里的问题),网上整理了一下资料,原来是进行了cookie加密(js),所以接下来思路是很清晰了,就是分析js,然后拿到加密后的数据.所以我直接拿到相应信息
<script>var x="[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@@[email protected]@[email protected]@[email protected]@[email protected]@@@@@[email protected]@[email protected]@@@[email protected]@[email protected]@@[email protected]@@[email protected]@[email protected]@@[email protected]@[email protected]@@[email protected]@[email protected]@[email protected]@[email protected]@@@[email protected]@[email protected]@@[email protected]@[email protected]@[email protected]@@[email protected]@[email protected]@@[email protected][email protected]@[email protected]@@@@[email protected]@@[email protected]@@[email protected]@[email protected]@[email protected]@@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@@@location".replace(/@*$/,"").split("@"),y="4 49=7(){4l(‘58.a=58.n+58.36.3g(/[\\\\?|&]39-47/,\\\\‘\\\\‘)‘,42);4m.55=‘40=2c.1m|9|‘+(7(){4 49=[[(+!~~[])]+[-~{}-~{}],[f+f],[(+!~~[])]+((-~-~[])*[-~-~[]]+[]+[[]][9]),[(+!~~[])]+[-~{}+(-~-~[]<<-~!{})],[(+!~~[])]+[~~{}],[(+!~~[])]+[-~[]+(-~!{}+[(-~~~{}<<-~~~{})])/[(-~~~{}<<-~~~{})]],[(+!~~[])]+((-~-~[]^(+!~~[]))+[[]][9]),[~~{}],((-~-~[]^(+!~~[]))+[[]][9]),[-~[]+(-~!{}+[(-~~~{}<<-~~~{})])/[(-~~~{}<<-~~~{})]],[(+!~~[])]+[(+!~~[])],[-~{}-~{}],[(+!~~[])]+[f+f],[(-~-~[]^(+!~~[]))+(-~-~[]^(+!~~[]))+(-~-~[]^(+!~~[]))],[(+!~~[])],((-~-~[])*[-~-~[]]+[]+[[]][9]),[-~{}+(-~-~[]<<-~!{})],[(+!~~[])]+[(-~{}<<(-~-~[]^(+!~~[])))],[(-~{}<<(-~-~[]^(+!~~[])))]],c=d(49.3j);2k(4 45=9;45<49.3j;45++){c[49[45]]=[‘%‘,‘15‘,‘3f‘,[-~{}+(-~-~[]<<-~!{})]+[{}+[[]][9]][9].22(2g),‘50‘,((-~-~[]^(+!~~[]))+[[]][9]),[-~{}-~{}],‘2b‘,[-~{}-~{}],(+[~~[], 
~~[]]+[]).22((+[]))+[~~{}]+([-~{}-~{}]/(+![])+[]+[[]][9]).22(-~-~[]+(-~-~[])*[-~-~[]])+[!{}+[]+[[]][9]][9].22(-~{}-~{}),(2m.27+[]).22(-~((-~-~[]^(+!~~[])))-~((-~-~[]^(+!~~[])))),‘%‘,‘11%‘,((f)/(+[])+[]).22(~~{})+({}+[]+[]).22([(+!~~[])]+[~~{}]),[-~[]+(-~!{}+[(-~~~{}<<-~~~{})])/[(-~~~{}<<-~~~{})]]+[(+!~~[])],‘12‘,[!-{}+[]+[]][9].22((-~~~{}<<-~~~{}))+[-~{}+(-~-~[]<<-~!{})],‘28‘,‘38‘][45]};l c.30(‘‘)})()+‘;25=4b, 1-6-54 32:1:1e j;4k=/;‘};2n((7(){10{l !!2m.3n;}4a(19){l 31;}})()){4m.3n(‘3d‘,49,31)}18{4m.3(‘1a‘,49)}",f=function(x,y){var a=0,b=0,c=0;x=x.split("");y=y||99;while((a=x.shift())&&(b=a.charCodeAt(0)-77.5))c=(Math.abs(b)<13?(b+48.5):parseInt(a,36))+y*c;return c},z=f(y.match(/\\w/g).sort(function(x,y){return f(x)-f(y)}).pop());while(z++)try{eval(y.replace(/\\b\\w+\\b/g, function(y){return x[f(y,z)-1]||("_"+y)}));break}catch(_){}</script>
经过js优化:
<script>var x = "@@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@@@[email protected]@@[email protected]@@[email protected]@[email protected]@@@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@@@[email protected]@[email protected]@@@@@[email protected]@@[email protected]@[email protected]@[email protected]@[email protected]@@[email protected]@[email protected]@[email protected]@@[email protected]@@@[email protected]@@@[email protected]@[email protected]@@@[email protected]@@@@[email protected]@[email protected]@[email protected]@@@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@@[email protected]".replace(/@*$/, "").split("@"), y = "402 241=213(){1002(‘11.204=11.30+11.300.34(/[\\\\?|&]201-1003/,\\\\‘\\\\‘)‘,342);22.111=‘403=410.422|23|‘+(213(){402 241=[((+!+{})-~(+!+{})-~[-~(+[])+(-~[]<<-~(+!+{}))]+[]+[]),(-~[]+[[]][23])+([-~[]-~[]]*((-~{}+[-~(+!+{})]>>-~(+!+{})))+[]+[]),((-~(+[])|1000)+[]),[((-~![]<<-~![])<<(-~![]<<-~![]))],(-~[]+[[]][23])+((-~(+[])|1000)+[]),(-~[]+[[]][23]),(-~(+!+{})+[[]][23]),(-~[]+[[]][23])+(-~[]+[[]][23]),(-~[]+[[]][23])+((-~[]<<-~(+!+{}))+[[]][23]),[432],[414+(-~![]<<-~![])+(-~![]<<-~![])],(-~[]+[[]][23])+[432],(-~[]+[[]][23])+[~~{}],(-~[]+[[]][23])+(-~(+!+{})+[[]][23]),((-~[]<<-~(+!+{}))+[[]][23]),[~~{}],([-~[]-~[]]*((-~{}+[-~(+!+{})]>>-~(+!+{})))+[]+[])],31=244(241.10);431(402 
340=23;340<241.10;340++){31[241[340]]=[[{}+[[]][23]][23].331(([-~(+!+{})]+~~[]>>-~(+!+{}))),‘211‘,((+!+{})-~(+!+{})-~[-~(+[])+(-~[]<<-~(+!+{}))]+[]+[]),‘424‘,[!‘‘+[[]][23]][23].331(-~[]-~[])+({}+[]+[[]][23]).331((1000^-~(+[]))),(-~[]+[[]][23]),‘14‘,([-~[]-~[]]*((-~{}+[-~(+!+{})]>>-~(+!+{})))+[]+[]),‘310%‘,[!/!/+[]][23].331((1000^-~(+[]))),[414+(-~![]<<-~![])+(-~![]<<-~![])],((-~(+[])|1000)+[]),‘4‘,‘430‘,‘120‘,‘124‘,‘240‘][340]};133 31.232(‘‘)})()+‘;103=303, 113-334-21 332:400:311 102;141=/;‘};220((213(){101{133 !!344.140;}441(333){133 420;}})()){22.140(‘3‘,241,420)}304{22.230(‘320‘,241)}", f = function (x, y) { var a = 0, b = 0, c = 0; x = x.split(""); y = y || 99; while ((a = x.shift()) && (b = a.charCodeAt(0) - 77.5)) c = (Math.abs(b) < 13 ? (b + 48.5) : parseInt(a, 36)) + y * c; return c }, z = f(y.match(/\\w/g).sort(function (x, y) { return f(x) - f(y) }).pop()); while (z++) try { eval(y.replace(/\\b\\w+\\b/g, function (y) { return x[f(y, z) - 1] || ("_" + y) })); break } catch (_) { }</script>
经过参考资料,和自己的研究,发现关键地方
于是 我把 eval 替换成 console.log
经过整理得到(上图与下面js代码声明的不一样,但是基本上一样,):
var _3a = function () { setTimeout(‘location.href=location.pathname+location.search.replace(/[\\?|&]captcha-challenge/,\\‘\\‘)‘, 1500); document.cookie = ‘__jsl_clearance=1558947273.79|0|‘ + (function () { var _3a = [((-~(+[]) | 2) + []), ((-~[] << -~(+!+{})) + [[]][0]), (-~[] + [[]][0]) + (-~[] + [[]][0]), (-~[] + [[]][0]) + [~~{}], (-~(+!+{}) + [[]][0]) + (-~[] + [[]][0]), (-~[] + [[]][0]) + ((-~(+[]) | 2) + []), (-~(+!+{}) + [[]][0]) + [~~{}], (-~[] + [[]][0]) + ([-~[] - ~[]] * ((-~{} + [-~(+!+{})] >> -~(+!+{}))) + [] + []), (-~[] + [[]][0]) + [3 + (-~![] << -~![]) + (-~![] << -~![])], (-~[] + [[]][0]) + [5], [5], ((+!+{}) - ~(+!+{}) - ~[-~(+[]) + (-~[] << -~(+!+{}))] + [] + []), ([-~[] - ~[]] * ((-~{} + [-~(+!+{})] >> -~(+!+{}))) + [] + []), [3 + (-~![] << -~![]) + (-~![] << -~![])], (-~[] + [[]][0]) + [((-~![] << -~![]) << (-~![] << -~![]))], [~~{}], [((-~![] << -~![]) << (-~![] << -~![]))], (-~[] + [[]][0]) + ((-~[] << -~(+!+{})) + [[]][0]), (-~[] + [[]][0]) + (-~(+!+{}) + [[]][0]), (-~(+!+{}) + [[]][0]), (-~[] + [[]][0]), (-~[] + [[]][0]) + ((+!+{}) - ~(+!+{}) - ~[-~(+[]) + (-~[] << -~(+!+{}))] + [] + [])], _4h = Array(_3a.length); for (var _28 = 0; _28 < _3a.length; _28++) { _4h[_3a[_28]] = [‘YM%‘, (-~(+!+{}) + [[]][0]), ‘xG‘, [5] + [{} + [] + []][0].charAt(-~[] - ~[]) + ([-~[] - ~[]] * ((-~{} + [-~(+!+{})] >> -~(+!+{}))) + [] + []), ‘D‘, ‘%‘, ((-~(+[]) | 2) + []), [window[‘callP‘ + ‘hantom‘] + [] + [[]][0]][0].charAt((-~![] << -~![])) + (!![[]][1] + [] + []).charAt((+!+{})), ‘T‘, ‘B‘, ‘B‘, ‘BP‘, ([-~[] - ~[]] * ((-~{} + [-~(+!+{})] >> -~(+!+{}))) + [] + []) + [5], ‘%‘, (!![[]][1] + [] + []).charAt((+!+{})), [!+{} + []][0].charAt(~~‘‘) + [(+!+{}) / ~~‘‘ + [] + []][0].charAt(([-~(+!+{})] + ~~[] >> -~(+!+{}))) + ((-~[] << -~(+!+{})) + [[]][0]) + [!/!/ + [[]][0]][0].charAt(-~[] - ~[]) + [{} + [] + []][0].charAt(-~[] - ~[]), (-~(+!+{}) + [[]][0]), (-~(+!+{}) + [[]][0]), (!+{} + []).charAt(-~![]), ({} + [] + [[]][0]).charAt((2 ^ -~(+[]))) + [(+!+{}) / ~~‘‘ + [] 
+ []][0].charAt((-~![] << -~![])), ‘K‘, ‘k%‘][_28] } ; return _4h.join(‘‘) })() + ‘;Expires=Mon, 27-May-19 09:54:33 GMT;Path=/;‘ }; if ((function () { try { return !!window.addEventListener; } catch (e) { return false; } })()) { document.addEventListener(‘DOMContentLoaded‘, _3a, false) } else { document.attachEvent(‘onreadystatechange‘, _3a) }
从上面可以看出网站在得到cookie之后又进行了一次加密.所以我们只要拿到上面代码中赋给 document.cookie 的数据,就是我们想要的cookie了
__jsl_clearance=1558954345.795|0|V%2Bp1UYNNA%2Fc4wboCF4SQoA%2Fy9j0%3D;Expires=Mon, 27-May-19 11:52:25 GMT;Path=/; 这就是我们要得到的数据,再加上第一次请求时服务器返回的cookie,然后将它们进行拼接就是我们要的cookie了,
想要在python下运行js,有很多包,这里我们使用 js2py 与 execjs (这两个都可以)
pip install Js2Py
pip install PyExecJS
两个代码基本类似,而且由于时间关系,很多地方没有优化,只是实现了功能,希望大家见谅(后期优化)
js2py 实现
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/5/27 17:18
# @Author  : yhl
# @Software: PyCharm
"""Solve the www.66ip.cn HTTP 521 JavaScript cookie challenge with js2py."""
import re

import execjs
import js2py
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'
}

# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 10


def get_521_content():
    """Request the challenge page and collect what is needed to solve it.

    Returns:
        A ``(script_text, cookie_header)`` tuple: the concatenated bodies of
        every ``<script>`` element in the response, and the response cookies
        serialised as ``name=value; name=value``.
    """
    req = requests.get('http://www.66ip.cn/1.html', headers=headers,
                       timeout=REQUEST_TIMEOUT)
    cookies = '; '.join('='.join(item) for item in req.cookies.items())
    # re.S lets "." cross newlines so a multi-line <script> body still matches.
    txt_521 = ''.join(re.findall('<script>(.*?)</script>', req.text, re.S))
    return (txt_521, cookies)


def fixed_fun(function):
    """Run the challenge script and return the ``__jsl_clearance`` cookie pair.

    NOTE(security): this executes JavaScript served by the remote site inside
    js2py. Only use it against hosts you are prepared to trust.

    Args:
        function: The challenge JavaScript (``<script>`` tags are tolerated).

    Returns:
        The first ``;``-separated segment of the generated cookie string,
        i.e. ``__jsl_clearance=<value>``.

    Raises:
        ValueError: If the script does not have the expected challenge shape.
    """
    print(function)
    js = function.replace("<script>", "").replace("</script>", "")
    if "{eval(" not in js:
        raise ValueError("challenge script contains no '{eval(' call to rewrite")
    # Stage 1: capture the de-obfuscated source instead of eval-ing it.
    js = js.replace("{eval(", "{var my_data_1 = (")
    print(js)
    # Use js2py's interop to read back the my_data_1 value assigned above.
    context = js2py.EvalJs()
    context.execute(js)
    js_temp = context.my_data_1
    print(js_temp)
    index1 = js_temp.find("document.")
    index2 = js_temp.find("};if((")
    if index1 == -1 or index2 == -1 or index2 <= index1:
        raise ValueError("de-obfuscated script has no document.cookie assignment")
    # Stage 2: keep only the cookie assignment and redirect it to a variable.
    js_temp = js_temp[index1:index2].replace("document.cookie", "my_data_2")
    context.execute(js_temp)
    data = context.my_data_2
    print(data)
    jsl_clearance = str(data).split(';')[0]
    return jsl_clearance


def main():
    """Solve the challenge, then fetch a listing page with both cookies."""
    content, cookie_id = get_521_content()
    cookie_id1 = fixed_fun(content)
    headers['Cookie'] = cookie_id + ';' + cookie_id1
    res1 = requests.get(url='http://www.66ip.cn/3.html', headers=headers,
                        timeout=REQUEST_TIMEOUT)
    res1.encoding = 'gb2312'
    print(res1.text)


if __name__ == '__main__':
    main()
execjs 实现(容易出bug,但是还是可以出来的,亲测)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/5/27 17:18
# @Author  : yhl
# @Software: PyCharm
"""Solve the www.66ip.cn HTTP 521 JavaScript cookie challenge with PyExecJS."""
import re

import execjs
import js2py
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'
}

# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 10


def get_521_content():
    """Request the challenge page and collect what is needed to solve it.

    Returns:
        A ``(script_text, cookie_header)`` tuple: the concatenated bodies of
        every ``<script>`` element in the response, and the response cookies
        serialised as ``name=value; name=value``.
    """
    req = requests.get('http://www.66ip.cn/1.html', headers=headers,
                       timeout=REQUEST_TIMEOUT)
    cookies = '; '.join('='.join(item) for item in req.cookies.items())
    # re.S lets "." cross newlines so a multi-line <script> body still matches.
    txt_521 = ''.join(re.findall('<script>(.*?)</script>', req.text, re.S))
    return (txt_521, cookies)


def fixed_fun(function):
    """Run the challenge script and return the ``__jsl_clearance`` cookie pair.

    NOTE(security): this executes JavaScript served by the remote site in the
    local JS runtime that PyExecJS selects. Only use it against hosts you are
    prepared to trust.

    Args:
        function: The challenge JavaScript taken from the 521 response.

    Returns:
        The first ``;``-separated segment of the generated cookie string,
        i.e. ``__jsl_clearance=<value>``.

    Raises:
        ValueError: If the de-obfuscated script sets no ``document.cookie``.
    """
    print(function)
    # Stage 1: return the de-obfuscated source instead of eval-ing it.
    func_return = function.replace('eval', 'return')
    resHtml = "function getClearance(){" + func_return + "};"
    ctx = execjs.compile(resHtml)
    temp1 = ctx.call('getClearance')
    print(temp1)
    if 'document.cookie' not in temp1:
        raise ValueError("de-obfuscated script has no document.cookie assignment")
    # Stage 2: turn "document.cookie=<expr>" into "var a=<expr>;return a;".
    cookie_expr = temp1.split('document.cookie')[1].split("Path=/;'")[0]
    s = 'var a' + cookie_expr + "Path=/;';return a;"
    # Some challenge variants read the page URL through a DOM node; there is
    # no DOM here, so substitute the literal URL. Dots are escaped so they
    # match only a literal ".".
    s = re.sub(r'document\.create.*?firstChild\.href',
               '"{}"'.format('http://www.66ip.cn/1.html'), s)
    print('s--->', s)
    resHtml = "function getnewClearance(){" + s + "};"
    ctx = execjs.compile(resHtml)
    jsl_clearance = ctx.call('getnewClearance')
    clearance_pair = str(jsl_clearance).split(';')[0]
    print(jsl_clearance)
    return clearance_pair


def main():
    """Solve the challenge, then re-request the page with both cookies."""
    content, cookie_id = get_521_content()
    cookie_id1 = fixed_fun(content)
    headers['Cookie'] = cookie_id + ';' + cookie_id1
    res1 = requests.get(url='http://www.66ip.cn/1.html', headers=headers,
                        timeout=REQUEST_TIMEOUT)
    res1.encoding = 'gb2312'
    print(res1.text)


if __name__ == '__main__':
    main()
基于execjs实现结果
以上是关于爬虫521错误(又是一次和可爱的前端vs的故事)的主要内容,如果未能解决你的问题,请参考以下文章
关于错误“:=和`:=`(...)被定义为在j中使用,只有一次和特定的方式。见帮助(“:=”)“