Javascript爬虫
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Javascript爬虫相关的知识,希望对你有一定的参考价值。
想做一个 JS 爬虫:用 jQuery 的 Ajax 异步抓取一些页面,取出页面 <html> 中 <head> 里某个 <script> 内用 var 声明的变量的值,应该怎么实现?
你是不是想错了,AJAX异步抓取页面的HTML元素里的值,说实话,不现实。你说你要去抓取我不否认,但你要用Ajax去抓取,你还是别想了吧
参考技术A
用 Ajax 的话,能先解决跨域问题就不错了,抓取比较难。建议使用 htmlunit + jsoup。(本回答被提问者采纳)
javascript 邮件爬虫
'use strict'
// Mail listener class; instantiated below with the mail settings.
var MailListener = require('mail-listener-fixed')
// Configuration object; every `settings.service.*` value used in this file is read from it.
const settings = require('standard-settings').getSettings()
// Hand console.log to the listener as its `debug` option before constructing it.
settings.service.mail.debug = console.log
var mailListener = new MailListener(settings.service.mail)
// Firebase Admin SDK, initialised further down with the service-account key.
var admin = require('firebase-admin')
// HTTP client used to POST the attachment as form data.
const request = require('request')
// Used to stream the attachment from disk and to delete it after upload.
const fs = require('fs')
// gm configured with `imageMagick: true` (presumably selects the ImageMagick backend — confirm in the gm docs).
const gm = require('gm').subClass({imageMagick: true})
// `async` supplies the upload queue (async.queue).
const async = require('async')
// Service-account credentials loaded from the path given in the settings.
var serviceAccount = require(settings.service.firebase.key.path)
// Concurrency limit handed to async.queue.
// NOTE(review): the identifier is misspelled ("Parrallel") but is referenced elsewhere in the file, so it is left unchanged.
var maxParrallel = settings.service.maxParallel.value
/* let HeaderX = {
'eventKey': 'x-event',
'from': 'x-from'
} */
// Initialise the Firebase Admin SDK with the service-account credential and
// the database / storage-bucket names taken from the settings.
const firebaseOptions = {
  credential: admin.credential.cert(serviceAccount),
  databaseURL: `https://${settings.service.firebase.database.name}.firebaseio.com`,
  storageBucket: `${settings.service.storage.name}.appspot.com`
}
admin.initializeApp(firebaseOptions)
/**
 * Upload queue. Each task is a parsed mail; only its first attachment is
 * uploaded. At most `maxParrallel` tasks run at the same time.
 *
 * The worker must call `callback` when the upload attempt has finished,
 * otherwise the slot is never released and the queue stops after the first
 * `maxParrallel` mails.
 */
var q = async.queue(function (mail, callback) {
  uploadFile(mail, mail.attachments[0], mail.from, mail.subject, mail.html, function () {
    // Failures are already logged inside uploadFile. Complete the task without
    // an error so a failed upload only frees the slot and is not reported twice.
    callback()
  })
}, maxParrallel)
// NOTE(review): assigning `q.drain` is the async v1/v2 style; async v3 expects
// `q.drain(fn)` — confirm against the installed version.
q.drain = function () {
  console.log('All the media have been uploaded')
}
mailListener.start() // start listening
mailListener.on('server:connected', () => {
  console.log('imapConnected')
})
// Reconnect as soon as the server connection is lost.
mailListener.on('server:disconnected', () => {
  console.log('imapDisconnected')
  mailListener.start()
})
mailListener.on('error', (err) => {
  console.log(err)
})
// Queue every mail that carries at least one attachment, then log its details.
mailListener.on('mail', (mail, seqno, attributes) => {
  // An empty attachment list is truthy; check the length so the worker never
  // receives `undefined` as the file to upload.
  if (mail.attachments && mail.attachments.length > 0) {
    q.push(mail)
  }
  console.log(mail.headers)
  console.log('emailParsed', mail.attachments)
  console.log('emailParsed', mail.from)
  console.log('emailParsed', mail.subject)
})
mailListener.on('attachment', function (attachment) {
  // Intentionally empty: attachments are handled through the 'mail' event.
})
/**
 * Auto-orient an attachment in place, POST it to the configured API and
 * delete the local file once the upload has succeeded.
 *
 * @param {object} mail - Parsed mail the attachment belongs to (currently unused).
 * @param {object} file - Attachment; `file.path` and `file.fileName` are read.
 * @param {*} from - Sender of the mail (currently unused).
 * @param {*} subject - Subject of the mail (currently unused).
 * @param {*} html - HTML body of the mail (currently unused).
 * @param {function(Error=)} [done] - Optional completion callback. Called
 *   exactly once, with an error when the attachment is unusable or the
 *   upload request fails, and without arguments otherwise.
 */
let uploadFile = (mail, file, from, subject, html, done) => {
  const finish = typeof done === 'function' ? done : function () {}
  if (!file || !file.path) {
    const missing = new Error('uploadFile: attachment has no file path')
    console.error(missing)
    return finish(missing)
  }
  // auto-orient an image
  gm(file.path)
    .autoOrient()
    .write(file.path, function (orientErr) {
      // Best effort: a failed auto-orient is logged and the file is still
      // uploaded as it is on disk, as before.
      if (orientErr) {
        console.error(orientErr)
      }
      var formData = {
        name: file.fileName,
        file: fs.createReadStream(file.path),
        token: settings.service.bucket.token,
        bucket: settings.service.bucket.name,
        // NOTE(review): recipient is hard-coded — confirm whether it should come from settings.
        mailto: 'dduvacher@gmail.com'
      }
      request.post({url: settings.service.socialiteAPI.URL, formData: formData}, function optionalCallback (err, httpResponse, body) {
        if (err) {
          // The local file is kept when the upload fails.
          console.error('upload failed:', err)
          return finish(err)
        }
        console.log('Upload successful! Server responded with:', body)
        // fs.unlink requires a callback on current Node versions.
        fs.unlink(file.path, function (unlinkErr) {
          if (unlinkErr) {
            console.error('could not remove uploaded file:', unlinkErr)
          }
          // The upload itself succeeded, so a cleanup failure is not reported as a task error.
          finish()
        })
      })
    })
}
以上是关于Javascript爬虫的主要内容,如果未能解决你的问题,请参考以下文章
Python爬虫编程思想(109):基于Splash的爬虫--执行JavaScript的n种方式
爬虫 JavaScript 篇[Web 漏洞扫描器]
JAVA系列Google爬虫如何抓取JavaScript的?
Python爬虫(二十四)_selenium案例:执行javascript脚本
如果网页内容是由javascript生成的,应该怎么实现爬虫
javascript BitTorrent DHT爬虫