爬虫:爬取海词的翻译内容

Posted 细雨微光

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬虫:爬取海词的翻译内容相关的知识,希望对你有一定的参考价值。

在爬取海词的时候遇到了一个问题:页面异步加载数据时,请求里需要带上一个t值,但是这个t值是由js计算出来后直接拼接到url里的,我们无法从网页源码中直接得到。

 当在搜索框输入单词的时候:你在干嘛

搜索

替换下图中的page的值就能达到翻页的目的:

那么当前的目的就是要能够找到这段js代码,同时获取对应输入的t的值,来重新组合url

真正的url只需要如下内容:

我提前把关键字和t都处理了,写成了字典的形式,

key:你在干嘛  ff[key]:WuqarCRs

{"你在干嘛": "WuqarCRs"}  #提前处理成了这种,方便提取
url = "http://fuzz.dict.cn/dict/api.php?&action=fuzz&from=jsonp&q=" + key + "&t="+ ff[key]+"&page="

 

那么关键部分来了,我是如何获取T的呢。

大概思路是:在本地搭建一个服务器,把每个词发送给这段js代码去计算,然后把返回的结果保存起来。

处理过程的代码:

 

第一步:先找到那段js代码,里面是如何把输入的文字转换成8位字符串的算法

第二步:先安装node.js,然后把这段js代码提取出来,转换成node.js代码;如果不转换的话,在浏览器里面直接访问时是无法触发这段js加载的。

下面是node.js的代码,先执行node.js代码

// Node.js core modules: HTTP server, query-string parsing and value inspection.
var http = require('http');
var querystring = require('querystring');
var util = require('util');

http.createServer(function(req, res){
    // Per-request state shared by the 'data' and 'end' handlers below.
    var post = '';   // response payload built by the 'data' handler
    var mm = '';     // working string used by the 'data' handler
    
    
/**
 * Compute the 8-character `t` token used by the dict.cn fuzzy-search URL
 * for a query string.
 *
 * The text is UTF-8 encoded, the salt "PaSs" (plus `global.dict_pagetoken`
 * when it is set) is appended, the result is hashed with MD5, and 12 hex
 * digits of the digest are re-encoded as four 2-character groups.
 *
 * @param {string} J - Query text to sign.
 * @returns {string} The 8-character token.
 */
function dictCrypto(J) {
  // Add two 32-bit words modulo 2^32. The top two bits are handled separately
  // so the intermediate sum never leaves the range bitwise operators can handle.
  function r(g, f) {
    var e, d, a, b, c;
    a = g & 2147483648;
    b = f & 2147483648;
    e = g & 1073741824;
    d = f & 1073741824;
    c = (g & 1073741823) + (f & 1073741823);
    if (e & d) {
      return c ^ 2147483648 ^ a ^ b
    }
    return e | d ? c & 1073741824 ? c ^ 3221225472 ^ a ^ b : c ^ 1073741824 ^ a ^ b : c ^ a ^ b
  }

  // MD5 round 1 step: mixes with (f & e) | (~f & d), then rotates left by b bits.
  function I(g, f, e, d, a, b, c) {
    g = r(g, r(r(f & e | ~f & d, a), c));
    return r(g << b | g >>> 32 - b, f)
  }

  // MD5 round 2 step: mixes with (f & d) | (e & ~d).
  function s(g, f, e, d, a, b, c) {
    g = r(g, r(r(f & d | e & ~d, a), c));
    return r(g << b | g >>> 32 - b, f)
  }

  // MD5 round 3 step: mixes with f ^ e ^ d.
  function w(g, f, e, d, a, b, c) {
    g = r(g, r(r(f ^ e ^ d, a), c));
    return r(g << b | g >>> 32 - b, f)
  }

  // MD5 round 4 step: mixes with e ^ (f | ~d).
  function v(g, f, e, d, a, b, c) {
    g = r(g, r(r(e ^ (f | ~d), a), c));
    return r(g << b | g >>> 32 - b, f)
  }

  // Encode a number as exactly two characters of a custom base-64 alphabet,
  // least-significant digit first, left-padded with "+" (the zero digit).
  function K(c) {
    for (var b = "++"; c > 0;) {
      var a = c % 64;
      b += a == 0 ? "+" : a == 1 ? "-" : a > 1 && a < 12 ? String.fromCharCode(a + 46) : a > 11 && a < 38 ? String.fromCharCode(a + 54) : String.fromCharCode(a + 59);
      c = (c - a) / 64
    }
    return b.substr(b.length - 2, 2)
  }

  // Render a 32-bit word as 8 hex digits, lowest byte first.
  function H(d) {
    var c = "",
      b = "",
      a;
    for (a = 0; a <= 3; a++) {
      b = d >>> a * 8 & 255;
      b = "0" + b.toString(16);
      c += b.substr(b.length - 2, 2)
    }
    return c
  }

  // Build the byte string to hash (one character per byte): CRLF is normalised
  // to LF, the text is UTF-8 encoded, then the salt is appended. Every code unit
  // >= 2048 is written as three bytes, so surrogate halves are encoded one by one.
  function encodeMessage(d) {
    d = d.replace(/\r\n/g, "\n");

    for (var c = "", b = 0; b < d.length; b++) {
      var a = d.charCodeAt(b);
      if (a < 128) {
        c += String.fromCharCode(a)
      } else {
        if (a > 127 && a < 2048) {
          c += String.fromCharCode(a >> 6 | 192)
        } else {
          c += String.fromCharCode(a >> 12 | 224);
          c += String.fromCharCode(a >> 6 & 63 | 128)
        }
        c += String.fromCharCode(a & 63 | 128)
      }
    }
    c += String.fromCharCode(80, 97, 83, 115); // "PaSs"
    if (global.dict_pagetoken) {
      c += global.dict_pagetoken
    }
    return c
  }

  // Pack the byte string into little-endian 32-bit words with MD5 padding:
  // a 0x80 marker after the data and the bit length in the last two words.
  // Untouched slots stay undefined and act as 0 in the bitwise operations.
  function toWordArray(g) {
    var f, e = g.length;
    f = e + 8;
    for (var d = ((f - f % 64) / 64 + 1) * 16, a = Array(d - 1), b = 0, c = 0; c < e;) {
      f = (c - c % 4) / 4;
      b = c % 4 * 8;
      a[f] |= g.charCodeAt(c) << b;
      c++
    }
    a[(c - c % 4) / 4] |= 128 << c % 4 * 8;
    a[d - 2] = e << 3;
    a[d - 1] = e >>> 29;
    return a
  }

  var x = toWordArray(encodeMessage(J));
  var G, L, q, p, k;
  // MD5 initial state.
  var F = 1732584193;
  var E = 4023233417;
  var D = 2562383102;
  var C = 271733878;
  // Process the message one 16-word (64-byte) block at a time.
  for (k = 0; k < x.length; k += 16) {
    G = F;
    L = E;
    q = D;
    p = C;
    F = I(F, E, D, C, x[k + 0], 7, 3614090360);
    C = I(C, F, E, D, x[k + 1], 12, 3905402710);
    D = I(D, C, F, E, x[k + 2], 17, 606105819);
    E = I(E, D, C, F, x[k + 3], 22, 3250441966);
    F = I(F, E, D, C, x[k + 4], 7, 4118548399);
    C = I(C, F, E, D, x[k + 5], 12, 1200080426);
    D = I(D, C, F, E, x[k + 6], 17, 2821735955);
    E = I(E, D, C, F, x[k + 7], 22, 4249261313);
    F = I(F, E, D, C, x[k + 8], 7, 1770035416);
    C = I(C, F, E, D, x[k + 9], 12, 2336552879);
    D = I(D, C, F, E, x[k + 10], 17, 4294925233);
    E = I(E, D, C, F, x[k + 11], 22, 2304563134);
    F = I(F, E, D, C, x[k + 12], 7, 1804603682);
    C = I(C, F, E, D, x[k + 13], 12, 4254626195);
    D = I(D, C, F, E, x[k + 14], 17, 2792965006);
    E = I(E, D, C, F, x[k + 15], 22, 1236535329);
    F = s(F, E, D, C, x[k + 1], 5, 4129170786);
    C = s(C, F, E, D, x[k + 6], 9, 3225465664);
    D = s(D, C, F, E, x[k + 11], 14, 643717713);
    E = s(E, D, C, F, x[k + 0], 20, 3921069994);
    F = s(F, E, D, C, x[k + 5], 5, 3593408605);
    C = s(C, F, E, D, x[k + 10], 9, 38016083);
    D = s(D, C, F, E, x[k + 15], 14, 3634488961);
    E = s(E, D, C, F, x[k + 4], 20, 3889429448);
    F = s(F, E, D, C, x[k + 9], 5, 568446438);
    C = s(C, F, E, D, x[k + 14], 9, 3275163606);
    D = s(D, C, F, E, x[k + 3], 14, 4107603335);
    E = s(E, D, C, F, x[k + 8], 20, 1163531501);
    F = s(F, E, D, C, x[k + 13], 5, 2850285829);
    C = s(C, F, E, D, x[k + 2], 9, 4243563512);
    D = s(D, C, F, E, x[k + 7], 14, 1735328473);
    E = s(E, D, C, F, x[k + 12], 20, 2368359562);
    F = w(F, E, D, C, x[k + 5], 4, 4294588738);
    C = w(C, F, E, D, x[k + 8], 11, 2272392833);
    D = w(D, C, F, E, x[k + 11], 16, 1839030562);
    E = w(E, D, C, F, x[k + 14], 23, 4259657740);
    F = w(F, E, D, C, x[k + 1], 4, 2763975236);
    C = w(C, F, E, D, x[k + 4], 11, 1272893353);
    D = w(D, C, F, E, x[k + 7], 16, 4139469664);
    E = w(E, D, C, F, x[k + 10], 23, 3200236656);
    F = w(F, E, D, C, x[k + 13], 4, 681279174);
    C = w(C, F, E, D, x[k + 0], 11, 3936430074);
    D = w(D, C, F, E, x[k + 3], 16, 3572445317);
    E = w(E, D, C, F, x[k + 6], 23, 76029189);
    F = w(F, E, D, C, x[k + 9], 4, 3654602809);
    C = w(C, F, E, D, x[k + 12], 11, 3873151461);
    D = w(D, C, F, E, x[k + 15], 16, 530742520);
    E = w(E, D, C, F, x[k + 2], 23, 3299628645);
    F = v(F, E, D, C, x[k + 0], 6, 4096336452);
    C = v(C, F, E, D, x[k + 7], 10, 1126891415);
    D = v(D, C, F, E, x[k + 14], 15, 2878612391);
    E = v(E, D, C, F, x[k + 5], 21, 4237533241);
    F = v(F, E, D, C, x[k + 12], 6, 1700485571);
    C = v(C, F, E, D, x[k + 3], 10, 2399980690);
    D = v(D, C, F, E, x[k + 10], 15, 4293915773);
    E = v(E, D, C, F, x[k + 1], 21, 2240044497);
    F = v(F, E, D, C, x[k + 8], 6, 1873313359);
    C = v(C, F, E, D, x[k + 15], 10, 4264355552);
    D = v(D, C, F, E, x[k + 6], 15, 2734768916);
    E = v(E, D, C, F, x[k + 13], 21, 1309151649);
    F = v(F, E, D, C, x[k + 4], 6, 4149444226);
    C = v(C, F, E, D, x[k + 11], 10, 3174756917);
    D = v(D, C, F, E, x[k + 2], 15, 718787259);
    E = v(E, D, C, F, x[k + 9], 21, 3951481745);
    F = r(F, G);
    E = r(E, L);
    D = r(D, q);
    C = r(C, p)
  }

  // Only the first four hex digits of the first three state words are used:
  // 12 hex digits, read as four 3-digit numbers and encoded two characters each.
  var digits = H(F).substr(0, 4) + H(E).substr(0, 4) + H(D).substr(0, 4);
  return K(parseInt(digits.substr(0, 3), 16)) +
    K(parseInt(digits.substr(3, 3), 16)) +
    K(parseInt(digits.substr(6, 3), 16)) +
    K(parseInt(digits.substr(9, 3), 16));
}
  // The request body is the raw word to sign, e.g. chunk = "你好".
    req.on('data', function(chunk){
        // Collect the body text in mm and re-sign everything received so far, so a
        // body delivered in several chunks yields one token for the whole word
        // instead of several tokens concatenated together.
        // NOTE: chunk.toString() decodes each chunk on its own, so a multi-byte
        // character split across two chunks would still be garbled.
        mm += chunk.toString();
        process.stdout.write(mm + '\n');
        post = dictCrypto(mm);
    });

    req.on('end', function(){
        // Reply in the `{ <token>: '' }` shape the Python client's regex expects.
        // The object is built directly rather than via querystring.parse(), which
        // decodes '+' as a space and would corrupt any token containing '+'.
        var result = {};
        if (post !== '') {
            result[post] = '';
        }
        res.end(util.inspect(result));
    });
}).listen(8888);

// Startup banner with the address the Python client posts to.
console.log('Server running at http://127.0.0.1:8888/');
View Code

 

第三步:正常的python代码,去访问本地的服务器,直接把转换完的数据存储到本地

#! /usr/bin/env python
#coding: utf-8
import re
import os
import requests
import sys
import json
reload(sys)
sys.setdefaultencoding(\'utf-8\')
path = "D:\\\\106_data\\\\juhai_data\\\\"
ff = open(path + "answer_1.txt",\'a\')
f = open("data_1.dict")   #这个是你的词典,按照行来访问词典
tt = {}
i = 1
j = 1

s = requests.session()
s.keep_alive = False

while 1:
    word = f.readline()
    if not word:
        ans = json.dumps(tt)
        ff.write(ans)
        break
    print word,
    if (i%100000 == 0):#一万个词存储一次,存的格式为字典
        j = j + 1
        ans = json.dumps(tt)
        ff.write(ans)
        ff.close()
        ff = open(path + "answer_" +str(j) + ".txt",\'a\')
        tt = {}
    word = word.strip(\'\\n\')
    html = requests.post("http://127.0.0.1:8888/",data =word,headers={\'Connection\':\'close\'})
    print html.text
    xx = re.search("{ (.*?): \'\' }",html.text,re.S)#用到了正则去提取内容
    try:
        xx = xx.group(1)
        xx = xx.strip("\'")
    except:
        continue
    tt[word] = xx
    print xx
    i = i+1
    s = requests.session()
    s.keep_alive = False
ff.close()
f.close()

 

以上是关于爬虫:爬取海词的翻译内容的主要内容,如果未能解决你的问题,请参考以下文章

scrapy主动退出爬虫的代码片段(python3)

Python 利用爬虫爬取网页内容 (div节点的疑惑)

Python 爬虫篇 - 调用有道翻译api接口翻译外文网站的整篇西班牙文实战演示。爬取西班牙语文章调用有道翻译接口进行整篇翻译

python --爬虫--爬取百度翻译

Python爬虫之破解百度翻译--requests案例详解

网页爬虫:零基础用爬虫爬取网页内容