Javascript爬虫

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Javascript爬虫相关的知识,希望对你有一定的参考价值。

想做一个 JS 爬虫:用 jQuery 通过 Ajax 异步抓取一些页面,取出页面 <html> 里 <head> 里 <script> 中某个 var 变量的值,该怎么实现?

你是不是想错了?用 AJAX 异步抓取页面 HTML 元素里的值,说实话,不现实。你说你要去抓取,我不否认;但你要用 Ajax 去抓取,还是别想了吧。

参考技术A:用 ajax 的话,先解决跨域问题就不错了,抓取比较难。可以考虑 htmlunit + jsoup。(本回答被提问者采纳)

javascript 邮件爬虫

'use strict'

// --- Dependencies and service wiring -------------------------------------
// Settings are loaded first because the mail listener, the Firebase
// credentials and the queue concurrency are all read from them.
const MailListener = require('mail-listener-fixed')
const settings = require('standard-settings').getSettings()

// Route the listener's debug output to the console before constructing it.
const mailConfig = settings.service.mail
mailConfig.debug = console.log
const mailListener = new MailListener(mailConfig)

const admin = require('firebase-admin')
const request = require('request')
const fs = require('fs')
const gm = require('gm').subClass({imageMagick: true})
const async = require('async')

// Service-account key file path and upload concurrency both come from settings.
const serviceAccount = require(settings.service.firebase.key.path)
const maxParrallel = settings.service.maxParallel.value

/* let HeaderX = {
  'eventKey': 'x-event',
  'from': 'x-from'
} */

// Initialise the Firebase admin SDK with the service-account credential.
const firebaseOptions = {
  credential: admin.credential.cert(serviceAccount),
  databaseURL: `https://${settings.service.firebase.database.name}.firebaseio.com`,
  storageBucket: `${settings.service.storage.name}.appspot.com`
}
admin.initializeApp(firebaseOptions)

/**
 * Upload queue. Each task is a parsed mail whose first attachment is
 * uploaded; at most `maxParrallel` uploads run at the same time.
 *
 * The worker hands `callback` to `uploadFile` so the queue is told when a
 * task has finished. Without that signal the queue would stop processing
 * after the first `maxParrallel` mails and `drain` would never fire.
 */
const q = async.queue(function (mail, callback) {
  uploadFile(mail, mail.attachments[0], mail.from, mail.subject, mail.html, callback)
}, maxParrallel)

// NOTE(review): assigning `q.drain` is the async v2 style; async v3 expects
// `q.drain(fn)` instead — confirm against the installed version.
q.drain = function () {
  console.log('All the media have been uploaded')
}

mailListener.start() // start listening

mailListener.on('server:connected', () => {
  console.log('imapConnected')
})

// Reconnect as soon as the IMAP connection drops.
mailListener.on('server:disconnected', () => {
  console.log('imapDisconnected')
  mailListener.start()
})

mailListener.on('error', (err) => {
  console.log(err)
})

mailListener.on('mail', (mail, seqno, attributes) => {
  // An empty attachments array is truthy, so check the length as well:
  // the queue worker dereferences `mail.attachments[0]`.
  if (mail.attachments && mail.attachments.length > 0) {
    q.push(mail)
  }
  console.log(mail.headers)
  console.log('emailParsed', mail.attachments)
  console.log('emailParsed', mail.from)
  console.log('emailParsed', mail.subject)
})

mailListener.on('attachment', function (attachment) {
  // console.log(attachment.path)
})

/**
 * Auto-orient an attachment in place, POST it to the configured API and
 * delete the local file once the upload succeeded.
 *
 * @param {object} mail - Parsed mail (currently unused, kept for callers).
 * @param {object} file - Attachment; `file.path` and `file.fileName` are read.
 * @param {*} from - Sender (currently unused).
 * @param {*} subject - Subject (currently unused).
 * @param {*} html - HTML body (currently unused).
 * @param {function(Error=)} [done] - Optional completion callback, invoked
 *   exactly once: with the error when the upload failed, otherwise with no
 *   argument after the local file has been cleaned up.
 */
const uploadFile = (mail, file, from, subject, html, done) => {
  const finish = typeof done === 'function' ? done : () => {}

  // auto-orient an image
  gm(file.path)
    .autoOrient()
    .write(file.path, function (orientErr) {
      if (orientErr) {
        // Best effort: the file is still uploaded even if it could not be
        // re-oriented, exactly as before; the failure is only logged.
        console.error(orientErr)
      }

      // var fromMail = from[0].address
      const eventbucket = settings.service.bucket.name
      const eventbuckettoken = settings.service.bucket.token

      // if (mail.headers[HeaderX.from]) {
      //   fromMail = mail.headers[HeaderX.from]
      // }

      const formData = {
        name: file.fileName,
        file: fs.createReadStream(file.path),
        token: eventbuckettoken,
        bucket: eventbucket,
        // NOTE(review): recipient is hard-coded; see the commented-out
        // `fromMail` logic above.
        mailto: 'dduvacher@gmail.com'
      }

      request.post({url: settings.service.socialiteAPI.URL, formData: formData}, function optionalCallback (err, httpResponse, body) {
        if (err) {
          console.error('upload failed:', err)
          return finish(err)
        }

        console.log('Upload successful!  Server responded with:', body)

        // fs.unlink requires a callback on current Node versions; a failed
        // cleanup is logged but does not fail the task.
        fs.unlink(file.path, function (unlinkErr) {
          if (unlinkErr) {
            console.error(unlinkErr)
          }
          finish()
        })
      })
    })
}

以上是关于Javascript爬虫的主要内容,如果未能解决你的问题,请参考以下文章

Python爬虫编程思想(109):基于Splash的爬虫--执行JavaScript的n种方式

爬虫 JavaScript 篇[Web 漏洞扫描器]

JAVA系列Google爬虫如何抓取JavaScript的?

Python爬虫(二十四)_selenium案例:执行javascript脚本

如果网页内容是由javascript生成的,应该怎么实现爬虫

javascript BitTorrent DHT爬虫