python 与V8 javascript解析器的Scrapy蜘蛛。更多信息请访问http://dutchcoders.ghost.io/using-scrapy-and-pyv8-to-scrape

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python 与V8 javascript解析器的Scrapy蜘蛛。更多信息请访问http://dutchcoders.ghost.io/using-scrapy-and-pyv8-to-scrape相关的知识,希望对你有一定的参考价值。

import PyV8

class js_dom_stylesheet(PyV8.JSClass):
    """Minimal stand-in for a DOM stylesheet object handed to scripts.

    It carries a placeholder ``cssText`` and a one-entry ``cssRules`` list.
    The attribute hooks defer unchanged to the parent class.
    """

    def __init__(self, document, *args, **kwargs):
        # ``document`` and any extra arguments are accepted but not used.
        placeholder = "test"
        self.cssRules = [{'cssText': placeholder}]
        self.cssText = placeholder

    def __getattr__(self, name):
        """Delegate the missing-attribute lookup to the parent class."""
        parent = super(js_dom_stylesheet, self)
        return parent.__getattr__(name)

    def __setattr__(self, name, value):
        """Delegate attribute assignment to the parent class."""
        parent = super(js_dom_stylesheet, self)
        parent.__setattr__(name, value)

    def __delattr__(self, name):
        """Delegate attribute deletion to the parent class."""
        parent = super(js_dom_stylesheet, self)
        parent.__delattr__(name)


class js_dom_element(PyV8.JSClass):
    """Stub DOM element exposed to scripts evaluated through PyV8.

    Every instance looks the same regardless of what was requested: an
    ``HTML`` tag with empty ``innerHTML``/``className``/``id`` and zeroed
    offsets. Most DOM methods are no-ops that return ``None``, an empty
    dict, or another stub element.

    ``parentNode`` is not defined up front; it is assigned on the child by
    ``appendChild`` and ``insertBefore``.
    """

    def __init__(self, document, *args, **kwargs):
        """Build a stub element owned by ``document``.

        Extra positional/keyword arguments (e.g. the tag name passed to
        ``createElement``) are only printed for debugging, not stored.
        """
        print ("js_dom_element", args, kwargs)
        # The original appVersion literal ended with a stray double quote
        # (copy/paste slip from the userAgent line); it is removed here.
        self.navigator = {
            'userAgent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) "
                         "AppleWebKit/537.36 (KHTML, like Gecko) "
                         "Chrome/38.0.2106.0 Safari/537.36",
            'appVersion': "5.0 (Macintosh; Intel Mac OS X 10_10_0) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/38.0.2106.0 Safari/537.36",
        }
        self.tagName = "HTML"
        self.nodeType = 9
        self.style = {'background': None}
        # Both attribute names refer to the same stylesheet stub.
        self.sheet = self.styleSheet = js_dom_stylesheet(document)
        self.innerHTML = ""
        self.className = ""
        self.id = ""
        self.offsetLeft = self.offsetHeight = 0
        self.document = self.ownerDocument = document

    def __str__(self):
        """Return the declared properties followed by the instance dict."""
        return str(self.__properties__) + str(self.__dict__)

    def appendChild(self, *args, **kwargs):
        """Mark this element as parent of the first argument and return it."""
        args[0].parentNode = self
        return args[0]

    def getBoundingClientRect(self, *args, **kwargs):
        """Return an empty rectangle description."""
        return {}

    def removeChild(self, *args, **kwargs):
        """No-op; always returns ``None``."""
        return None

    def insertBefore(self, *args, **kwargs):
        """Mark this element as parent of the first argument and return it."""
        args[0].parentNode = self
        return args[0]

    def offsetTop(self, *args, **kwargs):
        """Return a zero offset."""
        return 0

    def getAttribute(self, *args, **kwargs):
        """No attributes are tracked; always returns ``None``."""
        return None

    def ondrop(self, *args, **kwargs):
        """No-op event handler."""
        pass

    def ondragstart(self, *args, **kwargs):
        """No-op event handler."""
        pass

    def setAttribute(self, *args, **kwargs):
        """No-op; always returns ``None``."""
        return None

    def __getattr__(self, name):
        """Delegate the missing-attribute lookup to the parent class."""
        return super(js_dom_element, self).__getattr__(name)

    def __setattr__(self, name, value):
        """Delegate attribute assignment to the parent class."""
        super(js_dom_element, self).__setattr__(name, value)

    def __delattr__(self, name):
        """Delegate attribute deletion to the parent class."""
        super(js_dom_element, self).__delattr__(name)

    def attachEvent(self, *args, **kwargs):
        """No-op event registration."""
        pass

    def getComputedStyle(self, *args, **kwargs):
        """Return an empty style mapping."""
        return {}

    def getElementsByTagName(self, *args, **kwargs):
        """Return a list holding one fresh stub element, whatever the tag."""
        return [js_dom_element(self.document)]

    def addEventListener(self, *args, **kwargs):
        """No-op event registration."""
        pass


class js_window(PyV8.JSClass):
    """Stub ``window`` object exposed to scripts evaluated through PyV8.

    It holds a fixed ``location`` and ``navigator``, an empty ``Event``
    mapping and a reference to the owning document. Browser APIs such as
    ``setTimeout`` or ``postMessage`` are no-ops. Attribute access is traced
    with ``print`` for debugging.
    """

    def __init__(self, document):
        """Create the window for ``document``.

        Each assignment below goes through ``__setattr__`` and is printed.
        """
        self.location = {'href': '', 'hostname': 'www.test.nl'}
        self.Event = {}
        self.document = document
        # The original appVersion literal ended with a stray double quote
        # (copy/paste slip from the userAgent line); it is removed here.
        self.navigator = {
            'userAgent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) "
                         "AppleWebKit/537.36 (KHTML, like Gecko) "
                         "Chrome/38.0.2106.0 Safari/537.36",
            'appVersion': "5.0 (Macintosh; Intel Mac OS X 10_10_0) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/38.0.2106.0 Safari/537.36",
        }

    def top(self):
        """Return this window."""
        return self

    def self(self):
        """Return this window."""
        return self

    def WebSocket(self, *args, **kwargs):
        """No-op placeholder."""
        pass

    def ontouchstart(self, *args, **kwargs):
        """No-op event handler."""
        pass

    def setTimeout(self, *args, **kwargs):
        """No-op; the callback is never scheduled."""
        pass

    def postMessage(self, *args, **kwargs):
        """No-op placeholder."""
        pass

    def pushState(self, *args, **kwargs):
        """No-op placeholder."""
        pass

    def history(self, *args, **kwargs):
        """No-op placeholder."""
        pass

    def __setattr__(self, name, value):
        """Trace the assignment, then store it; AttributeError is swallowed."""
        print ("js_window.__setattr__", name, value)

        try:
            super(js_window, self).__setattr__(name, value)
        except AttributeError:
            # Best effort: a script assigning something that cannot be stored
            # must not abort evaluation.
            print ("__setattr__,AttributeError")

    def __getattr__(self, name):
        """Trace and resolve a missing attribute, returning ``None`` if unknown.

        The parent lookup is performed exactly once and inside the ``try``.
        Previously the debug print repeated the lookup outside the ``try``,
        so an unknown name raised before the ``None`` fallback could run.
        """
        print ("js_window.___getattr__", name)

        try:
            value = super(js_window, self).__getattr__(name)
        except AttributeError:
            return None

        print ("js_window.___getattr__", name, value)
        return value

    def __delattr__(self, name):
        """Trace the deletion, then delegate to the parent class."""
        print ("js_window.__delattr__", name)
        super(js_window, self).__delattr__(name)

    def addEventListener(self, *args, **kwargs):
        """No-op event registration."""
        pass

    def attachEvent(self, *args, **kwargs):
        """No-op event registration."""
        pass

class js_event(PyV8.JSClass):
    """Bare event object for scripts; it only carries an empty ``__proto__``."""

    def __init__(self):
        # Start with an empty prototype mapping.
        self.__proto__ = dict()


class js_document(PyV8.JSClass):
    """Stub ``document`` object exposed to scripts evaluated through PyV8.

    It owns the stub window plus ``body`` and ``documentElement`` elements.
    Element factories and lookups all return fresh ``js_dom_element`` stubs;
    the remaining DOM methods are no-ops. Attribute access is traced with
    ``print`` for debugging.
    """

    def __init__(self):
        """Create the document together with its window and root elements.

        Each assignment below goes through ``__setattr__`` and is printed.
        """
        self.window = js_window(self)
        self.body = js_dom_element(self)
        self.location = {'href': '', 'hostname': 'www.test.nl'}
        self.documentElement = js_dom_element(self)
        # The original appVersion literal ended with a stray double quote
        # (copy/paste slip from the userAgent line); it is removed here.
        self.navigator = {
            'userAgent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) "
                         "AppleWebKit/537.36 (KHTML, like Gecko) "
                         "Chrome/38.0.2106.0 Safari/537.36",
            'appVersion': "5.0 (Macintosh; Intel Mac OS X 10_10_0) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/38.0.2106.0 Safari/537.36",
        }

    def appendChild(self, *args, **kwargs):
        """No-op; always returns ``None``."""
        return None

    def removeChild(self, *args, **kwargs):
        """No-op; always returns ``None``."""
        return None

    def getElementById(self, *args, **kwargs):
        """Trace the call and return a fresh stub element for any id."""
        print ("getElementById", args, kwargs)
        return js_dom_element(self)

    def attachEvent(self, *args, **kwargs):
        """No-op event registration."""
        pass

    def createEvent(self, *args, **kwargs):
        """Return a new stub event."""
        return js_event()

    def getElementsByTagName(self, *args, **kwargs):
        """Return a list holding one fresh stub element, whatever the tag."""
        return [js_dom_element(self)]

    def createTextNode(self, *args, **kwargs):
        """Return a stub element; the arguments are forwarded to it."""
        return js_dom_element(self, *args, **kwargs)

    def createComment(self, *args, **kwargs):
        """Return a stub element; the arguments are forwarded to it."""
        return js_dom_element(self, *args, **kwargs)

    def createDocumentFragment(self, *args, **kwargs):
        """Return a stub element; the arguments are forwarded to it."""
        return js_dom_element(self, *args, **kwargs)

    def createElement(self, *args, **kwargs):
        """Return a stub element; the arguments are forwarded to it."""
        return js_dom_element(self, *args, **kwargs)

    def querySelector(self, *args, **kwargs):
        """No-op; nothing is ever matched."""
        pass

    def evaluate(self, *args, **kwargs):
        """No-op placeholder."""
        pass

    def observe(self, *args, **kwargs):
        """No-op placeholder."""
        pass

    def __setattr__(self, name, value):
        """Trace the assignment, then delegate to the parent class."""
        print ("__setattr__", name)
        super(js_document, self).__setattr__(name, value)

    def __getattr__(self, name):
        """Trace and resolve a missing attribute, returning ``None`` if unknown."""
        print ("js_document.___getattr__", name)
        try:
            return super(js_document, self).__getattr__(name)
        except AttributeError:
            # Unknown names read as None instead of raising.
            return None

    def addEventListener(self, *args, **kwargs):
        """No-op event registration."""
        pass

    def __delattr__(self, name):
        """Trace the deletion, then delegate to the parent class."""
        print ("__delattr__", name)
        super(js_document, self).__delattr__(name)

class Global(PyV8.JSClass):      # define a compatible javascript class
    """Global object for the JavaScript context, mimicking a browser.

    It exposes ``document``, ``window``, ``navigator`` and ``HTMLElement``.
    Names not found on the instance are looked up through the window's
    ``__getattr__`` hook, then through the parent class.
    """

    def __init__(self):
        """Create the document and publish its window and related globals."""
        self.document = js_document()
        self.window = self.document.window
        # The original appVersion literal ended with a stray double quote
        # (copy/paste slip from the userAgent line); it is removed here.
        self.navigator = {
            'userAgent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) "
                         "AppleWebKit/537.36 (KHTML, like Gecko) "
                         "Chrome/38.0.2106.0 Safari/537.36",
            'appVersion': "5.0 (Macintosh; Intel Mac OS X 10_10_0) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/38.0.2106.0 Safari/537.36",
        }
        self.HTMLElement = js_dom_element(self.document)

    def Image(self):
        """Return a stub element standing in for a new ``Image``."""
        return js_dom_element(self.document)

    def __setattr__(self, name, value):
        """Delegate attribute assignment to the parent class.

        Uses ``super(Global, self)`` like the sibling classes. The previous
        ``super(PyV8.JSClass, self)`` started the lookup after ``JSClass``
        and so skipped its implementation.
        """
        super(Global, self).__setattr__(name, value)

    def __getattr__(self, name):
        """Resolve a missing global via the window, then the parent class.

        A truthy result from the window wins; otherwise the parent class
        decides and may raise ``AttributeError``.
        """
        # Guard: if ``window`` itself is missing, touching ``self.window``
        # below would re-enter this method without end.
        if name != 'window':
            # NOTE(review): this calls the window's fallback hook directly,
            # so attributes stored normally on the window instance are not
            # consulted. Confirm whether getattr(self.window, name, None)
            # was intended.
            try:
                value = self.window.__getattr__(name)
            except AttributeError:
                value = None
            if value:
                return value

        return super(Global, self).__getattr__(name)

    def __delattr__(self, name):
        """Delegate attribute deletion to the parent class."""
        super(Global, self).__delattr__(name)
# Text of the article

When using Scrapy it is easy to scrape HTML using selectors, but when you are confronted with inline javascript objects in the html it is another story.

I'm using PyV8 to evaluate the imported scripts and inline javascript. The javascript objects in gistfile2.py allow the javascript libraries to access browser variables, like window, history and selectors. The functions I implemented are sufficient to run jQuery and other frameworks. This will work about the same in the Google crawler, where javascript is also being interpreted and evaluated.

Warning: this is merely a proof of concept rather than production-ready code.

First you need to download and install PyV8.

Google V8 - http://code.google.com/p/v8/
PyV8 - http://code.google.com/p/pyv8/

This is the code of the scraper: gistfile1.py

This code mimics the browser: gistfile2.py



As you can see, it gets a page, creates a new context using Global(), and evaluates all script tags. If a script tag is remote, it downloads and runs it. The end result is that you can simply refer to objects defined within the page, in this case productsData, and use them as Python objects.

Work to do:

  *  make a nice library
  *  cache the evaluated context and downloaded scripts
  *  further enhance the browser mimicking.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http.request import Request
from scrapy.selector import Selector

import urllib2
import re
import PyV8
import json

from pdc.items import Product

class V8Spider(scrapy.Spider):
    """Spider that runs a page's scripts in PyV8 to read ``productsData``."""

    def parse_page(self, response):
        """Evaluate the page's scripts and yield one item per product variant.

        The item passed in ``response.meta['item']`` gets its ``link`` set to
        the page URL. Every ``<script>`` tag is evaluated in a fresh context
        backed by ``Global()``. Remote scripts are downloaded first. A script
        that fails to load or run is reported and skipped.

        If the scripts define a ``productsData`` global, a copy of the item
        is yielded for each colour/size combination. Otherwise the item is
        yielded unchanged.
        """
        item = response.meta['item']
        item['link'] = response.url

        exts = []

        productsData = None

        with PyV8.JSContext(Global(), extensions=exts) as ctxt:
            for script in response.xpath("//script"):
                try:
                    src_attr = script.xpath("@src")
                    if src_attr:
                        src = src_attr.extract()[0]

                        import requests
                        r = requests.get(src)

                        print("loading script source ", src)
                        source = js_escape_unicode(r.text)
                        # NOTE(review): the extension object is never used
                        # (``exts`` stays empty). It is kept in case
                        # constructing it registers the script as a side
                        # effect; confirm and drop if not needed.
                        ext = PyV8.JSExtension(str(src), source)
                        ctxt.eval(source)

                    inline = script.xpath("text()").extract()
                    if inline:
                        ctxt.eval(str(js_escape_unicode(inline[0])))

                except Exception:
                    # Best effort: one broken script must not stop the rest.
                    import traceback
                    traceback.print_exc()

            # Read the result while the context is still entered; the
            # original evaluated it after the ``with`` block had exited.
            # The ``typeof`` guard avoids a ReferenceError on pages that
            # never define productsData, so the fallback below can run.
            wrapped = ctxt.eval(
                "(typeof productsData !== 'undefined' && productsData !== null)"
                " ? [productsData] : null")
            if wrapped is not None:
                productsData = PyV8.convert(wrapped[0])

        if productsData is None:
            yield item
            return

        for color_key, color in productsData['colors'].items():
            for size_key in color['sizes']:
                size = productsData['sizes'][size_key]

                product_key = "{0}_{1}".format(color_key, size_key)
                product = productsData['products'][product_key]

                subitem = item.copy()
                subitem['productid'] = product['id']
                subitem['title'] = "{0} {1} {2}".format(product['name'], size['label'], color['label'])
                subitem['img'] = color['media']['images'][0]['page']

                # The price is the <strong> text plus the text of its <sup>
                # child, read from an HTML fragment.
                price = Selector(text=product['price_html'])
                subitem['price'] = price.xpath('//span[@class="new_price"]/strong/text()').extract()[0].strip() + price.xpath('//span[@class="new_price"]/strong/sup/text()').extract()[0].strip()

                yield subitem

以上是关于python 与V8 javascript解析器的Scrapy蜘蛛。更多信息请访问http://dutchcoders.ghost.io/using-scrapy-and-pyv8-to-scrape的主要内容,如果未能解决你的问题,请参考以下文章

javaScript的V8引擎

用JavaScript带你体验V8引擎解析标识符过程

V8中的快属性与内联缓存

V8中的快属性与内联缓存

补齐v8基础知识v8与JavaScript简介

V8 JavaScript引擎研究简介