python --selenium+phantomjs爬取动态页面广告源码
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python --selenium+phantomjs爬取动态页面广告源码相关的知识,希望对你有一定的参考价值。
背景:利用爬虫抓取网站页面中的广告元素,监控抓取到的元素数目,并定时发送监控邮件。
#!/usr/bin/env python2.7 # -*- coding: utf-8 -*- ‘‘‘ @xiayun @[email protected] #爬取网站内容,利用phantomjs:IP代理+修改UA+动态页面执行JS ‘‘‘ from selenium import webdriver from selenium.webdriver.common.desired_capabilities import DesiredCapabilities import time import urllib,urllib2 import smtplib import re from email.mime.text import MIMEText from email.header import Header import sys def reptile(): global result, data #proxy_ip.txt为IP代理池,可以自己爬IP,也可以买,不过都不稳定, #需要在前面再加一个IP验证程序。 IPS = [i for i in open("./proxy_ip.txt", ‘r‘).readline().split(‘\n‘) if i] print IPS for i in IPS: service_args = [] service_args = [‘--proxy-type=HTTP‘,] IP_str = ‘‘.join(i) print IP_str proxy_IP = ‘--proxy=%s‘ % IP_str service_args.append(proxy_IP) dcap = dict(DesiredCapabilities.PHANTOMJS) #创建UA头 dcap["phantomjs.page.settings.userAgent"] = (‘Mozilla/5.0 ([email protected]; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1‘) #利用phantomjs仿浏览器动作,参数2是代理IP driver = webdriver.PhantomJS(desired_capabilities=dcap, service_args=service_args) #设置访问超时时间 driver.implicitly_wait(60) driver.set_page_load_timeout(60) try: driver.get(‘网页地址‘) except: print "timeout" finally: data = driver.page_source time.sleep(20) req = r"广告元素" rule1 = re.compile(req) lists = re.findall(rule1, data) counts = len(lists) print counts # print data driver.quit() #判断广告元素是否为22 if counts == 22: print "The webpage is OK!" result = "The webpage is OK!Find 22 广告元素! proxy_IP:%s " % IP_str break if counts != 22: #IPS.remove(i) print "%s is bad!" 
% i.strip() result = "The webpage maybe bad" print "close" #返回结果和网页代码 return result, data def send_mail(result,data): receivers = [‘[email protected]‘] #接收人 mail_host = ‘smtp.exmail.qq.com‘ #代理邮箱smtp协议 mail_user = ‘[email protected]‘ #发送人 mail_pass = ‘xxxx‘ #密码 mail_postfix = ‘xxxx‘ #发件箱的后缀 title = str(result) msg = MIMEText(data, ‘plain‘, ‘utf-8‘) #文本格式内容 me = title.decode(‘utf-8‘) + "<" + mail_user + ">" msg[‘Subject‘] = Header(title, ‘utf-8‘) msg[‘From‘] = Header(me, ‘utf-8‘) msg[‘To‘] = Header(";".join(receivers), ‘utf-8‘) try: s = smtplib.SMTP() s.connect(mail_host) s.login(mail_user, mail_pass) s.sendmail(me,receivers , msg.as_string()) s.close() print "发送成功" return True except smtplib.SMTPException: print "Error: 无法发送邮件" return False if __name__ == ‘__main__‘: while 1: print ‘start‘ + ‘ ‘ + ‘‘.join(time.ctime(time.time())) result, data = reptile() send_mail(result=result, data=data) print ‘stop‘ + ‘ ‘ + ‘‘.join(time.ctime(time.time())) time.sleep(600) sys.exit(0)
本文出自 “echo xiayun” 博客,请务必保留此出处http://linuxerxy.blog.51cto.com/10707334/1893893
以上是关于python --selenium+phantomjs爬取动态页面广告源码的主要内容,如果未能解决你的问题,请参考以下文章