python tci.py

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了 python tci.py 相关的知识,希望对你有一定的参考价值。

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2016-04-28 06:16:32
# Project: TCI_Test

from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    """Crawler for the TCI e-shop category pages on www.tcichemicals.com.

    Callback chain: ``on_start`` -> ``index_page`` -> ``sub_index_page`` ->
    ``list_page`` -> ``detail_page``.  Only ``detail_page`` returns a
    result; every other step just queues further pages.
    """

    # Request headers imitating a desktop Chrome browser that prefers zh-CN.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }

    # Crawl settings: reuse the headers above and set a timeout of 100
    # (NOTE(review): unit is whatever pyspider defines -- presumably seconds).
    crawl_config = dict(headers=headers, timeout=100)

    # 24 * 60 * 30 minutes == 30 days.
    @every(minutes=24 * 60 * 30)
    def on_start(self):
        """Queue the two top-level category index pages."""
        category_urls = (
            'http://www.tcichemicals.com/eshop/zh/cn/category_index/00001/',
            'http://www.tcichemicals.com/eshop/zh/cn/category_index/00002/',
        )
        for category_url in category_urls:
            self.crawl(category_url, callback=self.index_page)

    def index_page(self, response):
        """Queue every link found in the marked lists of a category index."""
        anchors = response.doc('DIV#contents>DIV.section-box>DIV.text>table ul.mark a')
        for anchor in anchors.items():
            self.crawl(anchor.attr.href, callback=self.sub_index_page)

    def sub_index_page(self, response):
        """Descend into sub-category links, or treat the page as a list page.

        When the page has no ``h3.sub-titleA`` links it is handed directly
        to ``list_page`` instead of being crawled again.
        """
        title_links = list(response.doc('DIV#contents>DIV.section-box>h3.sub-titleA>a').items())
        if not title_links:
            self.list_page(response)
            return
        for title_link in title_links:
            self.crawl(title_link.attr.href, callback=self.list_page)

    def list_page(self, response):
        """Queue the detail page behind every chemical-name link."""
        name_links = response.doc('dl.chem-name dd a')
        for name_link in name_links.items():
            self.crawl(name_link.attr.href, callback=self.detail_page)

    def detail_page(self, response):
        """Extract the product names and CAS field from a detail page.

        Returns a dict with the keys ``url``, ``chn_name``, ``eng_name``
        and ``cas``.  A name stays ``''`` when no matching table row is
        found.
        """
        chn_name = ''
        eng_name = ''
        for row in response.doc('table.syg-tbl tr').items():
            label = row('th').text()
            # '中文' = "Chinese", '英文' = "English".  Only the first
            # non-empty value for each name is kept.
            if '中文' in label and not chn_name:
                chn_name = row('td').text()
            elif '英文' in label and not eng_name:
                eng_name = row('td').text()

        # The CAS value is taken from the fourth matching <span> (index 3).
        cas_spans = response.doc('form#cart DIV.section-box>table.base-tbl td>span')
        return {
            "url": response.url,
            "chn_name": chn_name,
            "eng_name": eng_name,
            "cas": cas_spans.eq(3).text(),
        }

以上是关于 python tci.py 的主要内容,如果未能解决你的问题,请参考以下文章:

001--python全栈--基础知识--python安装

Python代写,Python作业代写,代写Python,代做Python

Python开发

Python,python,python

Python 介绍

Python学习之认识python