A hand-written Python crawler script: scrape the ad links from search result pages and store them in an Excel workbook


This post presents a hand-written Python crawler script that scrapes the ad links from search result pages (Sogou, Baidu, and 360) and stores them in an Excel workbook.

# -*- coding: utf-8 -*-
from __future__ import unicode_literals

from bs4 import BeautifulSoup
import xlsxwriter
import threading
import requests
import sys, re, time

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
def data_clean(link_data):
    """
    Pull a clean advertiser domain out of the raw link text.
    """
    if re.search(r"[a-zA-Z]", link_data):
        # Grab the leading URL-like run of characters, then split it into path segments.
        raw_url = re.findall(r"[a-zA-Z0-9:./]+", link_data)[0]
        for each in raw_url.split("/"):
            # Return the first segment that looks like a hostname, skipping 360/so.com redirect domains.
            if re.match(r"\w+\.\w+", each) and not re.match(r"[A-Za-z0-9]*\.(360|so)\.(com|cn)", each):
                return each
    return None


def get_page_links(platform, search_keyword):
    """
    Fetch the search result page for one keyword and pick out the ad link elements.
    """
    url = url_prefix[platform] + search_keyword
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    if platform == '搜狗':
        # Sogou sponsored results: the cite tag inside the sponsored block holds the link text.
        page_links = soup.select("div.sponsored cite")
    elif platform == '百度':
        # Baidu: locate the ad marker spans, then walk back through the siblings to the link element.
        ads = soup.select("span[data-tuiguang]")
        page_links = []
        for ad in ads:
            page_links.append(ad.parent.parent.previous_sibling.previous_sibling.previous_sibling.previous_sibling.previous_sibling)
    elif platform == '360':
        # page_links = soup.select("div#m-spread-left cite, div#m-spread-bottom cite")
        page_links = []
        ads = soup.select("#e_haoso_fengwu_extend > span")
        for ad in ads:
            page_links.append(ad.previous_sibling.previous_sibling.previous_sibling.previous_sibling)
    else:
        raise Exception('unsupported search platform: ' + platform)
    return page_links


def write_data(worksheet, search_keyword, page_links, col, first_row):
    """
    Write the cleaned links for one keyword into a single column.
    """
    worksheet.write(0, col, search_keyword)
    for each in page_links:
        # Siblings can be None or empty text nodes, so guard before extracting text.
        if each and each.get_text():
            data = each.get_text().strip()
            cleaned_data = data_clean(data)
            print(data)
            if cleaned_data:
                first_row += 1
                worksheet.write(first_row, col, cleaned_data)


def write_to_worksheet(worksheet, first_row):
    """ 
    按照指定的行数,从第零列开始写入爬取的数据
    """
    col = 0
    search_keywords = ['网店转让', '网店出售转让', '网店转让平台', '天猫转让', '天猫店转让', '天猫转让平台', '氧趣网', '风向区']
    if sys.argv[2:]:
        search_keywords = re.split(r"\W+", sys.argv[2])
    for search_keyword in search_keywords:
        page_links = get_page_links(worksheet.name, search_keyword)
        write_data(worksheet, search_keyword, page_links, col, first_row)
        col += 1


def create_worksheet(platform):
    """
    Create one worksheet per search platform and fill it with the scraped links.
    """
    worksheet = workbook.add_worksheet(platform)
    worksheet.set_row(0, 20, header_format)
    worksheet.set_column(0, 20, 25)

    for i in range(number_of_replications):
        first_row = i * 10
        write_to_worksheet(worksheet, first_row)

    # Append a timestamp once scraping is done; Excel sheet names may not contain ':',
    # so dots are used in the time part.
    worksheet.name = worksheet.name + " " + time.strftime('%Y-%m-%d %H.%M.%S')

if __name__ == '__main__':
    workbook = xlsxwriter.Workbook('SEO_WebScraping.xlsx')
    header_format = workbook.add_format({'bold': True, 'font_name': 'Microsoft YaHei UI', 'font_size': 14})
    number_of_replications = int(sys.argv[1]) if sys.argv[1:] else 5

    url_prefix = {
        "搜狗" : "https://www.sogou.com/web?query=",
        "百度" : "http://www.baidu.com/s?ie=UTF-8&wd=",
        "360"  : "https://www.so.com/s?fr=none&src=srp&q="
        }

    # One scraping thread per search platform; each thread fills its own worksheet.
    threads = []
    for platform in url_prefix.keys():
        t = threading.Thread(target=create_worksheet, args=(platform,))
        threads.append(t)
        
    for thread in threads:
        thread.start()

    for thread in threads:
        thread.join()

    workbook.close()
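
To sanity-check the cleaning step without hitting any search engine, the sketch below feeds data_clean a couple of made-up link strings. The inputs and the file name seo_scraping.py are assumptions for illustration only, not part of the original script:

# Assumes the script above is saved as seo_scraping.py; importing it is safe
# because the scraping itself only runs under the __main__ guard.
from seo_scraping import data_clean

# Made-up examples of the raw cite/link text a result page might show.
print(data_clean("https://www.examplemall.com/item 广告"))  # -> 'www.examplemall.com'
print(data_clean("quan.360.cn/redirect"))                   # -> None: the engine's own domain is filtered out

To run the scraper itself (again assuming that file name), "python seo_scraping.py 3 网店转让 天猫转让" would do three scraping passes per platform with a custom keyword list; with no arguments it falls back to 5 passes and the built-in keywords.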

