厉害了!使用Python神不知鬼不觉爬取公司内部的ppt资料(勿做商业用途!)
Posted huohuohuo1
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了厉害了!使用Python神不知鬼不觉爬取公司内部的ppt资料(勿做商业用途!)相关的知识,希望对你有一定的参考价值。
在写爬虫的过程中遇到如下错误:
[WinError 10061] No connection could be made because the target machine actively refused it
解决方法:
1. 打开 IE 的 Internet Options(Internet 选项);
2. 进入 Connections -> LAN settings;
3. 勾选 "Automatically detect settings"。
封装好的db操作
# -*- coding:utf-8 -*-
# __author__ = 'ecaoyng'
"""Small helper around a MySQL connection used to store crawled URLs."""

import time


class DBOperation:
    """Open/close a MySQL connection and insert URLs into one table.

    Every method reports problems with ``print`` and returns ``None``
    instead of raising, so callers must check the returned connection and
    cursor before using them.
    """

    def __init__(self, tb_name):
        """Remember the target table name; connection settings are fixed here.

        Args:
            tb_name: Name of the single-column URL table.
        """
        # Placeholder connection settings -- replace with real values.
        self.db_host = 'x'
        self.db_port = 3306
        self.db_user = 'x'
        self.db_pwd = 'x'
        self.db_name = 'x'
        self.tb_name = tb_name

    def get_time(self):
        """Return the current local time as ``YYYY-mm-dd HH:MM:SS``."""
        return time.strftime('%Y-%m-%d %H:%M:%S')

    def _quoted_table(self):
        """Return the table name as a backtick-quoted MySQL identifier.

        Embedded backticks are doubled so the name cannot terminate the
        identifier and inject SQL. The whole name is quoted as ONE
        identifier, so a ``schema.table`` style name is not supported.
        """
        return '`' + str(self.tb_name).replace('`', '``') + '`'

    def db_conn(self):
        """Set up the connection with the database.

        Returns:
            The open connection, or ``None`` if connecting failed (the
            error is printed).

        Raises:
            ImportError: If the ``pymysql`` driver is not installed.
        """
        # Imported here rather than at module level so the module can be
        # imported on machines that do not have the MySQL driver installed.
        import pymysql

        exec_time = self.get_time()
        try:
            return pymysql.connect(host=self.db_host, port=self.db_port,
                                   user=self.db_user, passwd=self.db_pwd,
                                   db=self.db_name)
        except Exception as e:
            print('[%s]: Errors during db connection:%s' % (exec_time, e))
            return None

    def db_cursor(self, conn):
        """Create a cursor on ``conn``.

        Returns:
            The cursor, or ``None`` if ``conn`` is ``None`` or the cursor
            could not be created (the error is printed).
        """
        if conn is None:
            # db_conn() returns None on failure; say so explicitly instead
            # of printing an opaque AttributeError message.
            print('[%s]: cannot create cursor: no db connection'
                  % self.get_time())
            return None
        try:
            return conn.cursor()
        except Exception as e:
            print(e)
            return None

    def db_close(self, cur, conn):
        """Close the cursor and the connection.

        Either argument may be ``None`` (which is what ``db_cursor`` and
        ``db_conn`` return on failure); ``None`` values are skipped.
        """
        if cur is None and conn is None:
            return
        try:
            if cur is not None:
                cur.close()
        finally:
            # Close the connection even when closing the cursor failed.
            if conn is not None:
                conn.close()
        print('[%s]: db closed' % self.get_time())

    def tb_insert_url(self, cur, conn, urls):
        """Create the URL table if needed and insert ``urls`` into it.

        Args:
            cur: Open cursor (as returned by ``db_cursor``).
            conn: Open connection (as returned by ``db_conn``).
            urls: Iterable of URL strings; ``None`` or empty inserts nothing.

        Errors are printed, the transaction is rolled back, and the method
        returns ``None`` in every case.
        """
        exec_time = self.get_time()
        if cur is None or conn is None:
            print('[%s]: Errors during insert into %s:%s'
                  % (exec_time, self.tb_name, 'no db connection'))
            return

        table = self._quoted_table()
        tb_exist_sql = ('CREATE TABLE IF NOT EXISTS ' + table + ' (\n'
                        '    URL VARCHAR(200) NOT NULL\n'
                        ')')
        sql_insert_url = 'INSERT INTO ' + table + ' VALUES (%s)'
        # DB-API parameters are sequences: one 1-tuple per row.
        rows = [(url,) for url in (urls or ())]

        try:
            cur.execute(tb_exist_sql)
            print('[%s]: try to create table %s if not exists.'
                  % (exec_time, self.tb_name))
            conn.commit()

            if rows:
                cur.executemany(sql_insert_url, rows)
                conn.commit()
        except Exception as e:
            print('[%s]: Errors during insert into %s:%s'
                  % (exec_time, self.tb_name, e))
            try:
                # Do not leave a half-applied batch pending on the connection.
                conn.rollback()
            except Exception as rollback_error:
                print('[%s]: rollback failed:%s'
                      % (self.get_time(), rollback_error))


if __name__ == '__main__':
    # Smoke test: open a connection and a cursor, then close both.
    db = DBOperation('ECNSlides')
    db_conn = db.db_conn()
    db_cur = db.db_cursor(db_conn)
    db.db_close(db_cur, db_conn)
下面是爬虫程序:
# -*- coding:utf-8 -*-
# __author__ = 'ecaoyng'
"""Crawler that lists slide decks on a search page and downloads them."""

import os
import re
import shutil
import time
import urllib.request


class ESlidesCrawler:
    """Collect slide download links from the search page and fetch them.

    Failures are reported with ``print``; the public methods do not raise
    for network or parsing problems.
    """

    # Captures the "<category>/<id>" part of each thumbnail link.
    LINK_PATTERN = re.compile(
        '<li id=.*?>.*?<a href="/media/(.*?)" class="thumb" draggable="true">',
        re.S)
    # Captures the numeric media id of a download URL (used as file name).
    FILE_ID_PATTERN = re.compile(r'/(\d+)/download/original$')
    # Seconds before a stalled request is abandoned.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Set the search URL, the request headers and the output folder."""
        self.target_link = 'https://mediabank.ericsson.net/search/slides/group%20function%20%28gf%29'
        self.user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
        # No Accept-Encoding header on purpose: urllib.request does not
        # decompress response bodies, so asking for gzip/br would break the
        # utf-8 decode in get_page(). (The original sent the malformed
        # header name 'Accept - Encoding'.)
        self.user_headers = {
            'User-Agent': self.user_agent,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            # NOTE(review): a session cookie is hard-coded in source. It
            # will expire and should not be committed; load it from
            # configuration instead.
            'Cookie': 'PHPSESSID=57i0onm69eei46g6g23ek05tj2',
            'Host': 'mediabank.ericsson.net',
            'Referer': 'https://mediabank.ericsson.net/',
        }
        self.save_dir = 'C:/Users/ecaoyng/Desktop/PPT/'

    def get_time(self):
        """Return the current local time as ``YYYY-mm-dd HH:MM:SS``."""
        return time.strftime('%Y-%m-%d %H:%M:%S')

    def _open(self, url):
        """Open ``url`` with the crawler's headers and timeout."""
        request = urllib.request.Request(url, headers=self.user_headers)
        return urllib.request.urlopen(request, timeout=self.REQUEST_TIMEOUT)

    def get_page(self):
        """Fetch the search page.

        Returns:
            The page decoded as utf-8, or ``None`` if the request or the
            decoding failed (the error is printed).
        """
        now_time = self.get_time()
        try:
            with self._open(self.target_link) as response:
                return response.read().decode('utf-8')
        except (OSError, UnicodeDecodeError) as e:
            # URLError and socket timeouts are both OSError subclasses.
            print('%s Errors during connect to target link:%s' % (now_time, e))
            return None

    def get_links(self):
        """Build the download URL of every slide deck on the search page.

        Returns:
            A list of download URLs (possibly empty), or ``None`` when the
            page could not be fetched.
        """
        page_code = self.get_page()
        if page_code is None:
            print('page code returns none')
            return None
        return [
            '%s%s%s' % ('https://mediabank.ericsson.net/details/', item,
                        '/download/original')
            for item in self.LINK_PATTERN.findall(page_code)
        ]

    def save_links(self):
        """Store the download links in the ``ECNSlides`` database table."""
        now_time = self.get_time()
        links = self.get_links()
        if links is None:
            print('[%s]: URL is None when insert to db' % now_time)
            return

        # Imported here so downloading works without the database package.
        from ESlides.src.DBOperation import DBOperation

        db = DBOperation('ECNSlides')
        db_conn = db.db_conn()
        if db_conn is None:
            # db_conn() already printed the reason.
            return
        db_cur = db.db_cursor(db_conn)
        if db_cur is None:
            db_conn.close()
            return
        try:
            print('[%s]: start to urls insert to db' % now_time)
            db.tb_insert_url(db_cur, db_conn, links)
            # tb_insert_url() prints its own errors and returns nothing, so
            # this line only marks the end of the attempt.
            print('[%s]: write urls insert to db successfully' % now_time)
        finally:
            db.db_close(db_cur, db_conn)

    def _file_path_for(self, url):
        """Return the local ``.pptx`` path for ``url``, or ``None`` if the
        URL carries no numeric media id."""
        match = self.FILE_ID_PATTERN.search(url)
        if match is None:
            return None
        return os.path.join(self.save_dir, match.group(1) + '.pptx')

    def _download(self, url, file_path):
        """Stream ``url`` into ``file_path``.

        Raises:
            ValueError: If the server answers with an HTML page (for
                example a login page) instead of a file.
        """
        with self._open(url) as response:
            if response.info().get_content_type() == 'text/html':
                raise ValueError(
                    'server returned an HTML page instead of a slide file '
                    'for %s (authentication required?)' % url)
            with open(file_path, 'wb') as save_file:
                shutil.copyfileobj(response, save_file)

    def slides_download_params(self):
        """Download every slide deck found on the search page over http.

        Each deck is saved as ``<media id>.pptx`` in ``save_dir``. A failed
        download is printed and the remaining links are still processed.
        """
        links = self.get_links()
        if not links:
            print('[%s]: No slide links to download.' % self.get_time())
            return

        os.makedirs(self.save_dir, exist_ok=True)
        for url in links:
            file_path = self._file_path_for(url)
            if file_path is None:
                print('[%s]: Skipping %s: no media id in url.'
                      % (self.get_time(), url))
                continue

            print('Downloading to %s ...' % file_path)
            try:
                self._download(url, file_path)
            except Exception as e:
                print('[%s]: Errors during download slides: %s.'
                      % (self.get_time(), e))

    def slides_download_db(self):
        """Download slides using links stored in the remote db.

        Not implemented yet; currently does nothing.
        """


if __name__ == '__main__':
    crawler = ESlidesCrawler()
    # crawler.save_links()
    crawler.slides_download_params()
问题出现了:在浏览器地址栏中直接输入下载地址可以正常下载 pptx 文件,地址类似于
https://mediabank.ericsson.net/details/Organization%20simple/83138/download/original
但是在 Python 代码中请求同一个地址,返回的却不是 pptx 文件,而是 html 文件。
要知道具体返回的是什么文件的方法如下:
# reobj = urllib.request.urlopen(url)
# print(type(reobj))
# print(reobj.info())
# print(reobj.getcode())
可以看到正常如果下载的是zip文件,则返回的信息如下:
1 Content-Type: application/x-zip-compressed 2 Last-Modified: Mon, 23 May 2016 07:50:56 GMT 3 Accept-Ranges: bytes 4 ETag: "0f075d6c7b4d11:0" 5 Server: Microsoft-IIS/7.5 6 X-Powered-By: ASP.NET 7 Date: Wed, 29 Nov 2017 07:07:27 GMT 8 Connection: close 9 Content-Length: 55712699
但是本来是ppt文件,却下载了
1 Cache-Control: no-cache 2 Pragma: no-cache 3 Content-Length: 11743 4 Content-Type: text/html 5 Expires: Wed, 29 Nov 2017 07:04:04 GMT 6 Server: Microsoft-IIS/8.0 7 Set-Cookie: SMTargetSession=HTTPS%3A%2F%2Ffss%2Eericsson%2Ecom%2Fsiteminderagent%2Fredirectjsp%2Fredirect%2Dinternal%2Ejsp%3FSPID%3DMediabankIntern%26RelayState%3Dhttps%253A%252F%252Fmediabank%2Eericsson%2Enet%252Fdetails%252FOrganization%252520simple%252F83138%252Fdownload%252Foriginal%26SMPORTALURL%3Dhttps%253A%252F%252Ffss%2Eericsson%2Ecom%252Faffwebservices%252Fpublic%252Fsaml2sso%26SAMLTRANSACTIONID%3D176beb36%2Dfeb953b6%2D9a53d42e%2D58810506%2D087b72ac%2Da4e3; path=/ 8 Set-Cookie: ASPSESSIONIDACATSTTS=FOLBNEGCIBMFCPILNEMHOHFN; path=/ 9 X-Powered-By: ASP.NET 10 X-WAM-LOC: LP2-2 11 Date: Wed, 29 Nov 2017 07:05:04 GMT 12 Connection: close 13 Set-Cookie: BIGipServerWAM_PRD_Login=rd423o00000000000000000000ffff9958f466o50001; path=/
Content-Type: text/html 说明是html文件。将其打开之后发现是公司的安全认证页面.
于是开始思索是否可以用cookie的方式来抓取.
以上是关于厉害了!使用Python神不知鬼不觉爬取公司内部的ppt资料(勿做商业用途!)的主要内容,如果未能解决你的问题,请参考以下文章
Python爬取6271家死亡公司数据,看十年创业公司消亡史