Python: a hand-written crawler script that scrapes ad links from search result pages and stores them in an Excel file
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from bs4 import BeautifulSoup
import xlsxwriter
import threading
import requests
import sys, re, time
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
def data_clean(link_data):
    """
    Clean the scraped link text and return the bare domain, or None.
    """
    if re.search("[a-zA-Z]", link_data):
        # Take the first URL-like run of characters from the text.
        raw_url = re.findall(r"[a-zA-Z0-9:./]+", link_data)[0]
        for each in raw_url.split("/"):
            # Return the first domain-like segment that is not a 360/so.com domain.
            if re.match(r"\w+\.\w+", each) and not re.match(r"[A-Za-z0-9]*\.(360|so)\.(com|cn)", each):
                return each
    return None
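# A quick illustration of data_clean (the input strings are hypothetical examples
# of the link text scraped from a results page, not taken from the original post):
#   data_clean("https://www.example.com/shop 推广")  ->  "www.example.com"
#   data_clean("www.so.com/s?q=abc")                 ->  None  (the search engine's own domain is filtered out)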
def get_page_links(platform, search_keyword):
    """
    Fetch the search results page for the keyword and collect the ad-link elements for the given platform.
    """
    url = url_prefix[platform] + search_keyword
    # Send the User-Agent header defined above so the request looks like a normal browser.
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    if platform == '搜狗':
        # Sogou exposes the display URL of sponsored results in a <cite> tag.
        page_links = soup.select("div.sponsored cite")
    elif platform == '百度':
        ads = soup.select("span[data-tuiguang]")
        page_links = []
        for ad in ads:
            # Step two levels up from the ad-marker span, then back five siblings to reach
            # the element carrying the visible link text (tied to Baidu's markup at the time).
            page_links.append(ad.parent.parent
                              .previous_sibling.previous_sibling
                              .previous_sibling.previous_sibling
                              .previous_sibling)
    elif platform == '360':
        # page_links = soup.select("div#m-spread-left cite, div#m-spread-bottom cite")
        page_links = []
        ads = soup.select("#e_haoso_fengwu_extend > span")
        for ad in ads:
            # Step back four siblings from the ad marker to reach the element carrying the link text.
            page_links.append(ad.previous_sibling.previous_sibling
                              .previous_sibling.previous_sibling)
    else:
        raise Exception('This search platform is not supported')
    return page_links
def write_data(worksheet, search_keyword, page_links, col, first_row):
    """
    Do the actual writing: one keyword per column, cleaned links below it.
    """
    # Row 0 of each column holds the search keyword itself.
    worksheet.write(0, col, search_keyword)
    for each in page_links:
        if each.get_text():
            data = each.get_text().strip()
            cleaned_data = data_clean(data)
            print(data)
            if cleaned_data:
                first_row += 1
                worksheet.write(first_row, col, cleaned_data)
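# Illustrative worksheet layout produced by write_data for the first replication
# (first_row=0; the advertiser domains are hypothetical):
#   row 0:  网店转让                天猫转让
#   row 1:  www.advertiser-a.com    www.advertiser-b.com
#   row 2:  www.advertiser-c.com    ...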
def write_to_worksheet(worksheet, first_row):
    """
    Starting at the given row, write the scraped data column by column from column zero.
    """
    col = 0
    search_keywords = ['网店转让', '网店出售转让', '网店转让平台', '天猫转让', '天猫店转让', '天猫转让平台', '氧趣网', '风向区']
    # Keywords can also be passed as the second command-line argument, separated by non-word characters.
    if sys.argv[2:]:
        search_keywords = re.split(r"\W+", sys.argv[2:][0])
    for search_keyword in search_keywords:
        # The worksheet is named after the platform, so its name doubles as the platform key.
        page_links = get_page_links(worksheet.name, search_keyword)
        write_data(worksheet, search_keyword, page_links, col, first_row)
        col += 1
def create_worksheet(platform):
    # Name the worksheet after the platform so write_to_worksheet() can use it as the lookup key.
    worksheet = workbook.add_worksheet(platform)
    worksheet.set_row(0, 20, format)
    worksheet.set_column(0, 20, 25)
    for i in range(number_of_replications):
        # Each run of the keyword list starts ten rows below the previous one.
        first_row = i * 10
        write_to_worksheet(worksheet, first_row)
    # Append a timestamp to the sheet name; Excel sheet names may not contain ':', so use '.' instead.
    worksheet.name = worksheet.name + " " + time.strftime('%Y-%m-%d %H.%M.%S')
if __name__ == '__main__':
    workbook = xlsxwriter.Workbook('SEO_WebScraping.xlsx')
    format = workbook.add_format({'bold': True, 'font_name': 'Microsoft YaHei UI', 'font_size': 14})
    # The first command-line argument controls how many times the keyword list is scraped (default 5).
    number_of_replications = int(sys.argv[1]) if sys.argv[1:] else 5
    url_prefix = {
        "搜狗": "https://www.sogou.com/web?query=",
        "百度": "http://www.baidu.com/s?ie=UTF-8&wd=",
        "360": "https://www.so.com/s?fr=none&src=srp&q="
    }
    # Scrape each platform in its own thread; each thread writes to its own worksheet.
    threads = []
    for platform in url_prefix.keys():
        t = threading.Thread(target=create_worksheet, args=(platform,))
        threads.append(t)
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    workbook.close()
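To run the script, pass the number of scraping rounds as the first argument (default 5) and, optionally, a space-separated keyword list as the second; with no arguments it falls back to the built-in keyword list. The filename below is an assumption, since the post does not name the file:

python SEO_WebScraping.py 3 "网店转让 天猫店转让"

The results land in SEO_WebScraping.xlsx, with one worksheet per search platform, one column per keyword, and the cleaned advertiser domains listed under each keyword.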