Daily | A crawler for the China Statistical Yearbook and China Financial Yearbook data tables (with all the Excel files for 1985-2020 attached)

Posted by 囚生CY


Preface

Finals have been keeping me busy lately, so I am just leaving a possibly useful resource here: all of the Excel tables from the China Financial Yearbook (《中国金融年鉴》, 1986-2019) and the China Statistical Yearbook (《中国统计年鉴》, 1981-2020). The data was crawled from CNKI (the exact URLs are given in the main text below). As far as I know, nobody online offers the complete run of yearbook data from 1986 to the present; what is available mostly covers individual years only, and is paid at that.

Link: https://pan.baidu.com/s/13fjrInmjjxaNQRgS_Jv91w
Extraction code: k5ir

If the data is all you need, just grab it from the link above; there is no need to read the ramblings in the postscript.



1 Excel data crawler for the China Statistical Yearbook and China Financial Yearbook

  • The fish has already been provided above; what follows is how to fish. Fair warning: the fishing is not easy to learn, and I suggest crawling the site once yourself to find out where the pitfalls are (a minimal single-page sketch of the core request follows right after this list).
  • That said, Part 2 also briefly explains some details of the crawler.
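
To give a flavour of what the full script below does, here is a minimal sketch of its core request: a POST to CNKI's catalogue endpoint that returns one page of a volume's table of contents. The endpoint, form fields and the sample ybcode (the 2020 China Statistical Yearbook volume) are taken from the script below; everything else is illustrative only and assumes requests, beautifulsoup4 and lxml are installed.

import requests
from bs4 import BeautifulSoup

query_url = 'https://data.cnki.net/Yearbook/PartialGetCatalogResult'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0'}
formdata = {'ybcode': 'N2020100004', 'entrycode': '', 'page': '1', 'pagerow': '20'}	 # one catalogue page of 20 entries

response = requests.post(query_url, data=formdata, headers=headers)
soup = BeautifulSoup(response.text, 'lxml')
table = soup.find('table')
if table is not None:
	for tr in table.find_all('tr'):
		tds = tr.find_all('td')
		if len(tds) == 3:
			# first cell: table title; second cell: page range in the printed yearbook
			print(tds[0].get_text(strip=True), tds[1].get_text(strip=True))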

I am going through the hardest finals ever, so here is the crawler script for all the Excel tables of the China Statistical Yearbook and the China Financial Yearbook over the years:

# -*- coding: utf-8 -*-
# @author: caoyang
# @email: caoyang@163.sufe.edu.cn

import os
import re
import time
import requests

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains

from bs4 import BeautifulSoup

def get_cookie(url):
	# Launch a headless Firefox, visit the page once, and serialize its cookies
	# into a single "name=value; " string usable as a Cookie request header.
	options = webdriver.FirefoxOptions()
	options.add_argument("--headless")
	driver = webdriver.Firefox(options=options)
	driver.get(url)
	cookies = driver.get_cookies()
	driver.quit()
	def _cookie_to_string(cookies):
		string = ''
		for cookie in cookies:
			string += '{}={}; '.format(cookie['name'], cookie['value'])
		return string.strip()
	return _cookie_to_string(cookies)
	

def download_chinese_statistical_yearbook(ybcode='N2020100004', year='2020', save_root='csyb', is_initial=True, ignore_caj=True):
	with open('system_csyb.log', 'w') as f:
		pass
	headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0'}
	query_url = 'https://data.cnki.net/Yearbook/PartialGetCatalogResult'
	excel_url = 'https://data.cnki.net{}'.format						 # prepend the host to relative /download/excel hrefs
	caj_url = 'https://data.cnki.net/download/GetCajUrl'
	regex = r'<[^>]+>'													 # strip HTML tags from catalogue cells
	cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
	compiler = re.compile(regex, re.S)
	regular_interval = 15
	reset_interval = 300
	

	if not os.path.exists(save_root):
		os.mkdir(save_root)
	# year = ybcode[1:5]
	target_path = os.path.join(save_root, year)
	if not os.path.exists(target_path):
		os.mkdir(target_path)

	with open(os.path.join(target_path, 'log.txt'), 'w') as f:
		pass

	formdata = {
		'ybcode': ybcode,
		'entrycode': '',
		'page': '1',
		'pagerow': '20'
	}
	response = requests.post(query_url, data=formdata, headers=headers)
	html = response.text
	soup = BeautifulSoup(html, 'lxml')
	span = soup.find('span', class_='s_p_listl')
	for link in span.find_all('a'):
		onclick = link.attrs.get('onclick')
		if onclick is not None:
			lindex = onclick.find('\'')
			rindex = onclick.find('\'', lindex + 1)
			n_pages = int(onclick[lindex + 1:rindex])
			break
	with open('system_csyb.log', 'a') as f:
		f.write('正在处理{}年...\t{}\n'.format(year, time.strftime('%Y-%m-%d %H:%M:%S')))
	print('正在处理{}年...'.format(year))
	with open('system_csyb.log', 'a') as f:
		f.write('共计{}页\t{}\n'.format(n_pages, time.strftime('%Y-%m-%d %H:%M:%S')))
	print('共计{}页'.format(n_pages))
	for page in range(1, n_pages + 1):
		with open('system_csyb.log', 'a') as f:
			f.write('  - 第{}页..\t{}\n'.format(page, time.strftime('%Y-%m-%d %H:%M:%S')))
		print('  - 第{}页..'.format(page))
		if not page == 1:												 # page 1 was already fetched above
			formdata = {
				'ybcode': ybcode,
				'entrycode': '',
				'page': str(page),
				'pagerow': '20'
			}
			while True:
				try:
					response = requests.post(query_url, data=formdata, headers=headers)
					break
				except:
					with open('system_csyb.log', 'a') as f:
						f.write('    页面访问失败...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
					print('    页面访问失败...')
					time.sleep(reset_interval)				
			html = response.text
			soup = BeautifulSoup(html, 'lxml')
		table = soup.find('table')
		for tr in table.find_all('tr'):
			tds = tr.find_all('td')
			assert len(tds) == 3
			title = compiler.sub('', str(tds[0])).replace('\n', '').replace('\t', '').replace(' ', '').replace('\r', '')
			page_range = compiler.sub('', str(tds[1])).replace('\n', '').replace('\t', '').replace(' ', '')
			for _link in tds[2].find_all('a'):
				href = _link.attrs['href']
				if href.startswith('/download/excel'):					 # excel
					filecode = href[href.find('=')+1:]
					while True:
						_headers = headers.copy()
						_headers['Cookie'] = cookies
						try:
							with open('system_csyb.log', 'a') as f:
								f.write('    + 下载{}...\t{}\n'.format(title, time.strftime('%Y-%m-%d %H:%M:%S')))
							print('    + 下载{}...'.format(title))
							response = requests.get(excel_url(href), headers=_headers)
							print('      ' + str(response.status_code))
							try:
								html = response.text
								soup = BeautifulSoup(html, 'lxml')
								if str(soup.find('title').string)=='中国经济社会大数据研究平台':
									with open('system_csyb.log', 'a') as f:
										f.write('      重置cookie...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
									print('      重置cookie...')
									cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
								else:
									break
							except:
								break
						except:
							with open('system_csyb.log', 'a') as f:
								f.write('      失败...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
							print('      失败...')
							time.sleep(reset_interval)
							cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
					time.sleep(regular_interval)
					
					with open(os.path.join(target_path, '{}.xls'.format(filecode)), 'wb') as f:
						f.write(response.content)
					with open(os.path.join(target_path, 'log.txt'), 'a') as f:
						f.write('{}\t{}\t{}.xls\n'.format(title, page_range, filecode))
				else:													 # caj
					if ignore_caj:
						continue
					filecode = _link.attrs['fn']
					pagerange = _link.attrs['pg']
					disk = _link.attrs['disk']
					_formdata = {
						'filecode': filecode,
						'pagerange': pagerange,
						'disk': disk,
					}
					while True:
						_headers = headers.copy()
						_headers['Cookie'] = cookies	
						try:		
							with open('system_csyb.log', 'a') as f:
								f.write('    + 下载{}的资源链接...\t{}\n'.format(title, time.strftime('%Y-%m-%d %H:%M:%S')))
							print('    + 下载{}的资源链接...'.format(title))
							response = requests.post(caj_url, headers=_headers, data=_formdata)
							break
						except:
							with open('system_csyb.log', 'a') as f:
								f.write('      失败...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
							print('      失败...')
							time.sleep(reset_interval)
							cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
					resource_url = response.json()['url']
					while True:
						try:
							with open('system_csyb.log', 'a') as f:
								f.write('    + 下载{}...\t{}\n'.format(title, time.strftime('%Y-%m-%d %H:%M:%S')))
							print('    + 下载{}...'.format(title))
							response = requests.get(resource_url, headers=headers)
							if str(response.status_code) == '200':
								break 
							else:
								with open('system_csyb.log', 'a') as f:
									f.write('      重置cookie...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
								print('      重置cookie...')
								time.sleep(reset_interval)
								cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
						except:
							with open('system_csyb.log', 'a') as f:
								f.write('      失败...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
							print('      失败...')
							time.sleep(regular_interval)
							cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
					time.sleep(regular_interval)
					with open(os.path.join(target_path, '{}.caj'.format(filecode)), 'wb') as f:
						f.write(response.content)
					with open(os.path.join(target_path, 'log.txt'), 'a') as f:
						f.write('{}\t{}\t{}.caj\n'.format(title, page_range, filecode))

			
	# Find urls of year
	if is_initial:
		url = 'https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode)
		response = requests.get(url, headers=headers)
		html = response.text
		soup = BeautifulSoup(html, 'lxml')
		div = soup.find('div', class_='s_year clearfix')
		links = []
		ybcodes = []
		for link in div.find_all('a'):
			class_ = link.attrs.get('class')
			if class_ is None:											 # not current
				href = link.attrs.get('href')
				ybcode = href.split('/')[-1].split('?')[0]
				links.append(href)
				ybcodes.append(ybcode)
		with open('ybcode_csyb.txt', 'w') as f:
			for ybcode in ybcodes:
				f.write(f'{ybcode}\n')
		# for ybcode in ybcodes:
		#	download_chinese_statistical_yearbook(ybcode=ybcode, is_initial=False)
	

def download_chinese_financial_yearbook(ybcode='N2020070552', year='2019', save_root='cfyb', is_initial=True, ignore_caj=True):
	with open('system_cfyb.log', 'w') as f:
		pass
	headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0'}
	query_url = 'https://data.cnki.net/Yearbook/PartialGetCatalogResult'
	excel_url = 'https://data.cnki.net{}'.format
	caj_url = 'https://data.cnki.net/download/GetCajUrl'
	regex = r'<[^>]+>'
	# Cookies are fetched fresh with headless Firefox (see get_cookie above).
	cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
	compiler = re.compile(regex, re.S)
	regular_interval = 15
	reset_interval = 300
	

	if not os.path.exists(save_root):
		os.mkdir(save_root)
	# year = ybcode[1:5]
	target_path = os.path.join(save_root, year)
	if not os.path.exists(save_root):
		os.mkdir(save_root)
	# ... (the remainder follows the same steps as download_chinese_statistical_yearbook above,
	#      saving the downloaded tables under cfyb/<year>/)
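
Finally, a minimal usage sketch, assuming the script above is run as-is (headless Firefox plus geckodriver must be available for the cookie fetch; the default ybcodes point at the 2020 statistical and 2019 financial yearbook volumes):

if __name__ == '__main__':
	# Download the 2020 China Statistical Yearbook tables into csyb/2020/
	download_chinese_statistical_yearbook(ybcode='N2020100004', year='2020')
	# Download the 2019 China Financial Yearbook tables into cfyb/2019/
	download_chinese_financial_yearbook(ybcode='N2020070552', year='2019')

With is_initial=True the statistical-yearbook call also writes the ybcodes of all the other volumes to ybcode_csyb.txt; those codes can then be fed back in one by one with is_initial=False, as the commented-out loop in the function suggests.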
