[Daily] A scraper for the data tables of the China Statistical Yearbook and the China Financial Yearbook (with all Excel files, 1985-2020)
Posted by 囚生CY
Preface
Finals season has kept me busy lately, so I'm just posting a resource that may be useful: all the Excel tables of the China Financial Yearbook (1986-2019) and the China Statistical Yearbook (1981-2020). The data were scraped from CNKI (the exact URLs are given in the main text below). So far nobody online offers the complete yearbook data from 1986 to the present; what is available basically covers only particular years, and is paid at that.
Link: https://pan.baidu.com/s/13fjrInmjjxaNQRgS_Jv91w
Access code: k5ir
If you only need the files, grab them from the link above; feel free to skip the rambling in the postscript.
1 Excel data scraper for the China Statistical Yearbook and the China Financial Yearbook
- The fish is provided above; what follows is the fishing. Fair warning: the fishing is not easy to learn, and I suggest crawling the data yourself once to find out where the pitfalls are.
- That said, the second part also briefly explains the details of the scraper.
In the middle of the hardest finals ever, here is the scraper script for all the Excel tables of every edition of the China Statistical Yearbook and the China Financial Yearbook:
# -*- coding: utf-8 -*-
# @author: caoyang
# @email: caoyang@163.sufe.edu.cn
import os
import re
import time
import requests
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
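
# get_cookie(): open the given page in headless Firefox, collect the session
# cookies issued by data.cnki.net, and flatten them into a single "name=value; "
# string suitable for a Cookie request header. The download loops below call it
# again whenever a request comes back with the platform's landing page instead
# of a file, or after a request fails.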
def get_cookie(url):
    options = webdriver.FirefoxOptions()
    options.add_argument("--headless")
    driver = webdriver.Firefox(options=options)
    driver.get(url)
    cookies = driver.get_cookies()
    driver.quit()

    def _cookie_to_string(cookies):
        string = ''
        for cookie in cookies:
            string += '{}={}; '.format(cookie['name'], cookie['value'])
        return string.strip()

    return _cookie_to_string(cookies)
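
# download_chinese_statistical_yearbook(): crawl one edition of the China
# Statistical Yearbook on data.cnki.net, identified by its ybcode. The function
# POSTs the catalog endpoint page by page, parses each table row (title, page
# range, download links), saves every /download/excel link as <filecode>.xls
# under save_root/<year>/, optionally fetches CAJ files as well, and, when
# is_initial is True, writes the ybcodes of the other editions to
# ybcode_csyb.txt so the function can later be re-run per edition.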
def download_chinese_statistical_yearbook(ybcode='N2020100004', year='2020', save_root='csyb', is_initial=True, ignore_caj=True):
    # Truncate the run log.
    with open('system_csyb.log', 'w') as f:
        pass
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0'}
    query_url = 'https://data.cnki.net/Yearbook/PartialGetCatalogResult'
    excel_url = 'https://data.cnki.net/{}'.format  # bound str.format, used as a URL builder below
    caj_url = 'https://data.cnki.net/download/GetCajUrl'
    regex = r'<[^>]+>'
    cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
    compiler = re.compile(regex, re.S)
    regular_interval = 15
    reset_interval = 300
    if not os.path.exists(save_root):
        os.mkdir(save_root)
    # year = ybcode[1:5]
    target_path = os.path.join(save_root, year)
    if not os.path.exists(target_path):
        os.mkdir(target_path)
    with open(os.path.join(target_path, 'log.txt'), 'w') as f:
        pass
    formdata = {
        'ybcode': ybcode,
        'entrycode': '',
        'page': '1',
        'pagerow': '20'
    }
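    # Fetch page 1 of the catalog first; the total page count is embedded in an
    # onclick attribute inside the <span class="s_p_listl"> pager element.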
    response = requests.post(query_url, data=formdata, headers=headers)
    html = response.text
    soup = BeautifulSoup(html, 'lxml')
    span = soup.find('span', class_='s_p_listl')
    for link in span.find_all('a'):
        onclick = link.attrs.get('onclick')
        if onclick is not None:
            lindex = onclick.find('\'')
            rindex = onclick.find('\'', lindex + 1)
            n_pages = int(onclick[lindex + 1:rindex])
            break
    with open('system_csyb.log', 'a') as f:
        f.write('正在处理{}年...\t{}\n'.format(year, time.strftime('%Y-%m-%d %H:%M:%S')))
    print('正在处理{}年...'.format(year))
    with open('system_csyb.log', 'a') as f:
        f.write('共计{}页\t{}\n'.format(n_pages, time.strftime('%Y-%m-%d %H:%M:%S')))
    print('共计{}页'.format(n_pages))
    for page in range(1, n_pages + 1):
        with open('system_csyb.log', 'a') as f:
            f.write(' - 第{}页..\t{}\n'.format(page, time.strftime('%Y-%m-%d %H:%M:%S')))
        print(' - 第{}页..'.format(page))
        if page > 1:  # page 1 has already been fetched above
            formdata = {
                'ybcode': ybcode,
                'entrycode': '',
                'page': str(page),
                'pagerow': '20'
            }
            while True:
                try:
                    response = requests.post(query_url, data=formdata, headers=headers)
                    break
                except:
                    with open('system_csyb.log', 'a') as f:
                        f.write(' 页面访问失败...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
                    print(' 页面访问失败...')
                    time.sleep(reset_interval)
        html = response.text
        soup = BeautifulSoup(html, 'lxml')
        table = soup.find('table')
        for tr in table.find_all('tr'):
            tds = tr.find_all('td')
            assert len(tds) == 3
            title = compiler.sub('', str(tds[0])).replace('\n', '').replace('\t', '').replace(' ', '').replace('\r', '')
            page_range = compiler.sub('', str(tds[1])).replace('\n', '').replace('\t', '').replace(' ', '')
            for _link in tds[2].find_all('a'):
                href = _link.attrs['href']
                if href.startswith('/download/excel'):  # excel
                    filecode = href[href.find('=') + 1:]
                    while True:
                        _headers = headers.copy()
                        _headers['Cookie'] = cookies
                        try:
                            with open('system_csyb.log', 'a') as f:
                                f.write(' + 下载{}...\t{}\n'.format(title, time.strftime('%Y-%m-%d %H:%M:%S')))
                            print(' + 下载{}...'.format(title))
                            response = requests.get(excel_url(href), headers=_headers)
                            print(' ' + str(response.status_code))
                            try:
                                html = response.text
                                soup = BeautifulSoup(html, 'lxml')
                                if str(soup.find('title').string) == '中国经济社会大数据研究平台':
                                    # The platform's landing page came back instead of the spreadsheet,
                                    # which means the cookie has expired: refresh it and retry.
                                    with open('system_csyb.log', 'a') as f:
                                        f.write(' 重置cookie...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
                                    print(' 重置cookie...')
                                    cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
                                else:
                                    break
                            except:
                                break  # the response is not an HTML page, i.e. the spreadsheet itself was returned
                        except:
                            with open('system_csyb.log', 'a') as f:
                                f.write(' 失败...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
                            print(' 失败...')
                            time.sleep(reset_interval)
                            cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
                    time.sleep(regular_interval)
                    with open(os.path.join(target_path, '{}.xls'.format(filecode)), 'wb') as f:
                        f.write(response.content)
                    with open(os.path.join(target_path, 'log.txt'), 'a') as f:
                        f.write('{}\t{}\t{}.xls\n'.format(title, page_range, filecode))
                else:  # caj
                    if ignore_caj:
                        continue
                    filecode = _link.attrs['fn']
                    pagerange = _link.attrs['pg']
                    disk = _link.attrs['disk']
                    _formdata = {
                        'filecode': filecode,
                        'pagerange': pagerange,
                        'disk': disk,
                    }
                    while True:
                        _headers = headers.copy()
                        _headers['Cookie'] = cookies
                        try:
                            with open('system_csyb.log', 'a') as f:
                                f.write(' + 下载{}的资源链接...\t{}\n'.format(title, time.strftime('%Y-%m-%d %H:%M:%S')))
                            print(' + 下载{}的资源链接...'.format(title))
                            response = requests.post(caj_url, headers=_headers, data=_formdata)
                            break
                        except:
                            with open('system_csyb.log', 'a') as f:
                                f.write(' 失败...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
                            print(' 失败...')
                            time.sleep(reset_interval)
                            cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
                    resource_url = response.json()['url']
                    while True:
                        try:
                            with open('system_csyb.log', 'a') as f:
                                f.write(' + 下载{}...\t{}\n'.format(title, time.strftime('%Y-%m-%d %H:%M:%S')))
                            print(' + 下载{}...'.format(title))
                            response = requests.get(resource_url, headers=headers)
                            if str(response.status_code) == '200':
                                break
                            else:
                                with open('system_csyb.log', 'a') as f:
                                    f.write(' 重置cookie...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
                                print(' 重置cookie...')
                                time.sleep(reset_interval)
                                cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
                        except:
                            with open('system_csyb.log', 'a') as f:
                                f.write(' 失败...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
                            print(' 失败...')
                            time.sleep(regular_interval)
                            cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
                    time.sleep(regular_interval)
                    with open(os.path.join(target_path, '{}.caj'.format(filecode)), 'wb') as f:
                        f.write(response.content)
                    with open(os.path.join(target_path, 'log.txt'), 'a') as f:
                        f.write('{}\t{}\t{}.caj\n'.format(title, page_range, filecode))
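    # Edition discovery: links in the year selector that carry no class
    # attribute are the non-current editions; their ybcodes are collected and
    # saved so each edition can be downloaded in a later run.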
    # Find urls of year
    if is_initial:
        url = 'https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode)
        response = requests.get(url, headers=headers)
        html = response.text
        soup = BeautifulSoup(html, 'lxml')
        div = soup.find('div', class_='s_year clearfix')
        links = []
        ybcodes = []
        for link in div.find_all('a'):
            class_ = link.attrs.get('class')
            if class_ is None:  # not current
                href = link.attrs.get('href')
                ybcode = href.split('/')[-1].split('?')[0]
                links.append(href)
                ybcodes.append(ybcode)
        with open('ybcode_csyb.txt', 'w') as f:
            for ybcode in ybcodes:
                f.write(f'{ybcode}\n')
        # for ybcode in ybcodes:
        #     download_chinese_statistical_yearbook(ybcode=ybcode, is_initial=False)
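
# download_chinese_financial_yearbook(): the same procedure for the China
# Financial Yearbook. Judging from the visible part, only the default
# ybcode/year, the log file name (system_cfyb.log) and save_root ('cfyb')
# differ from the statistical-yearbook downloader above.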
def download_chinese_financial_yearbook(ybcode='N2020070552', year='2019', save_root='cfyb', is_initial=True, ignore_caj=True):
    with open('system_cfyb.log', 'w') as f:
        pass
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0'}
    query_url = 'https://data.cnki.net/Yearbook/PartialGetCatalogResult'
    excel_url = 'https://data.cnki.net/{}'.format
    caj_url = 'https://data.cnki.net/download/GetCajUrl'
    regex = r'<[^>]+>'
    # (The original pasted several hand-copied session-cookie strings here; each
    # assignment was immediately overwritten, and only the get_cookie() call
    # below takes effect, so they are not reproduced.)
    cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
    compiler = re.compile(regex, re.S)
    regular_interval = 15
    reset_interval = 300
    if not os.path.exists(save_root):
        os.mkdir(save_root)
    # year = ybcode[1:5]
    target_path = os.path.join(save_root, year)
    if not os.path.exists(target_path):
        os.mkdir(target_path)
    # (Truncated in the source; the remainder mirrors download_chinese_statistical_yearbook above.)
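To make the intended workflow explicit, here is a minimal driver sketch based on the commented-out loop at the end of download_chinese_statistical_yearbook: one initial call populates ybcode_csyb.txt with the codes of all other editions, after which each edition is downloaded with is_initial=False. The module name yearbook_spider and the year-from-ybcode shortcut are illustrative assumptions; adapt them to however you saved the script.
# Minimal driver sketch (assumes the script above is saved as yearbook_spider.py).
from yearbook_spider import download_chinese_statistical_yearbook

# First pass: download the 2020 edition and let the function write
# ybcode_csyb.txt with the codes of every other edition it finds.
download_chinese_statistical_yearbook(ybcode='N2020100004', year='2020', is_initial=True)

# Second pass: walk the collected codes. The year is taken from characters
# 1-4 of the ybcode (cf. the commented `# year = ybcode[1:5]` line in the
# function body); verify this holds for your editions before a long run.
with open('ybcode_csyb.txt', 'r') as f:
    ybcodes = [line.strip() for line in f if line.strip()]

for ybcode in ybcodes:
    download_chinese_statistical_yearbook(
        ybcode=ybcode,
        year=ybcode[1:5],
        is_initial=False,
    )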