爬虫实战国家企业公示网-webapi实现
Posted ZSYL
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬虫实战国家企业公示网-webapi实现相关的知识,希望对你有一定的参考价值。
学习目标:
- 了解 quart框架
- 了解 各个接口的功能
1. quart框架
1.1 介绍
- quart是基于Asyncio的Python微框架。它志在让开发者能够在Web开发中很容易地得到Asyncio带来的好处。它对Flask应用的支持最好,它和Flask拥有相同的API
- 支持 HTTP/1.1,HTTP/2 和 Websockets
- 扩展性很强,并支持很多Flask的扩展
1.2 安装
pip install quart
1.3 文档
https://gitlab.com/pgjones/quart/
2. 代码实现
2.1 webapi.py
/gsxt/webapi.py
# THE WINTER IS COMING! the old driver will be driving who was a man of the world!
# -*- coding: utf-8 -*- python 3.6.7, create time is 18-11-9 下午7:27 GMT+8
import os
from uuid import uuid1
from base64 import b64encode
from redis import StrictRedis
from quart import (
Quart,
make_response,
request,
jsonify,
)
import static
redis = StrictRedis(decode_responses=True)
app = Quart(__name__)
app.config['JSON_AS_ASCII'] = False # 让jsonify()返回的json数据以utf8编码方式正常显示中文
@app.route('/', methods=['GET'])
async def index():
"""首页 接口说明"""
headers = {"Content-type": "text/plain; charset=UTF-8"}
resp = make_response((static.__doc__, 200, headers))
return await resp
@app.route('/company', methods=['GET'])
async def company():
"""接收任务参数,写入redis"""
company_name = request.args.get('company_name', None)
if company_name is None:
return jsonify({'status': 'failed', 'msg': '/company?company_name=缺少公司名称参数'})
token = uuid1()
# token = 'captcha_img' # 简单测试用
gsxt_task_json = {
'company_name': company_name,
'crack_captcha_mode': request.args.get('crack_captcha_mode', None) or '0',
'token': token,
'status': 'wait',
'msg': '新任务等待抓取'
}
try: # 向redis推任务
task_key = '{}:{}'.format(static.GSXT_TASK_TOPIC, token)
redis.hmset(task_key, gsxt_task_json)
redis.expire(task_key, 600) # 过期时间
redis.rpush(static.GSXT_TOKEN, token)
redis.expire(static.GSXT_TOKEN, 590)
except:
gsxt_task_json['msg'] = '接收任务失败,检查redis及配置'
return jsonify(gsxt_task_json)
gsxt_task_json['msg'] = '成功接收抓取任务'
return jsonify(gsxt_task_json)
@app.route('/crack_captcha', methods=['GET', 'POST'])
async def crack_captcha():
"""GET: 返回手动打码的html
POST: 接收手动输入的验证码信息"""
if request.method == 'GET':
token = request.args.get('token', None)
if token is None:
return jsonify({'status': 'failed', 'msg': '/crack_captcha?token=缺少token参数'})
img_file_path = './images/{}.jpg'.format(token) # 图片路径:以token命名图片
# try:
with open(img_file_path, 'rb') as f:
img_b64_str = b64encode(f.read()).decode() # 为了前端渲染展示:图片二进制字节流转base64字符串
# format(IMG_BASE64, CRACK_CAPTCHA_URL, TOKEN)
return static.img_html%{'IMG_BASE64':img_b64_str,
'CRACK_CAPTCHA_URL':'/crack_captcha',
'TOKEN':token}
# except:
# return jsonify({'status': 'failed', 'msg': '没有名为{}.jpg的图片,稍后重试或检查任务是否存在'.format(token)})
elif request.method == 'POST':
form_data = await request.form
captcha_params = form_data.get('captcha_params', None)
# print(captcha_params)
token = form_data.get('token', None)
# 将验证坐标存入redis
task_key = '{}:{}'.format(static.GSXT_TASK_TOPIC, token)
redis.hset(task_key, 'captcha_params', captcha_params)
# os.remove('./images/{}.jpg'.format(token)) # 删除图片
return jsonify({'token': token,
'result_url': '/result?token={}'.format(token),})
@app.route('/result')
async def result():
"""查询任务结果"""
token = request.args.get('token', None)
if token is None:
return jsonify({'status': 'failed', 'msg': '/crack_captcha?token=缺少token参数'})
task_key = '{}:{}'.format(static.GSXT_TASK_TOPIC, token)
result_dict = redis.hgetall(task_key)
return jsonify(result_dict)
if __name__ == '__main__':
app.run()
2.2 static.py
/gsxt/static.py
"""
国家企业公示网实时数据抓取demo组件说明
webapi.py 交互演示功能
任务参数push到redis消息总线中,
数据查询也从redis消息总线中获取
GET /
接口说明文档
GET /company
接收任务参数
params json = {
company_name: 公司名称,
crack_captcha_mode: 默认'0'(手动破解); '1'为调用打码平台破解,
}
return json = {
company_name: 公司名称,
token: 任务唯一识别码,
crack_captcha_url: 手动破解验证的url,
}
GET /crack_captcha
获取验证图片并返回手动破解验证码的html
params json = {
token: 任务唯一识别码,
}
return image in HTML
POST /crack_captcha
接收手动输入的验证码信息
params json = {
token: 任务唯一识别码,
captcha_params: 验证码所需参数,
}
return json = {
token: 任务唯一识别码,
result_url: 查询抓取结果的url,
}
GET /result
params json = {
token: 任务唯一识别码,
}
return gsxt_task:token
redis 消息总线及缓存
gsxt_token = [token1, token2, ...]
gsxt_task:token
{
company_name,
crack_captcha_mode,
captcha_params,
status, # 'wait', 任务没开始
# 'crawling', 抓取中
# 'failed', 失败
# 'done', 完成
msg,
data,
}
crawler.py
轮询redis
抓取数据
static.py
静态变量
配置文件
"""
img_html = """<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>spider_gsxt</title>
</head>
<body>
<div>
<img id="imgPIC" src="data:image/jpg;base64,%(IMG_BASE64)s" alt="" οnclick="Show(this)">
</div>
<div>
X:<input id="xxx" type="text" />
Y:<input id="yyy" type="text" />
<h3>按顺序依次点击图中的字符,并点击提交</h3>
<form action="%(CRACK_CAPTCHA_URL)s" method="post">
<input type="text" id="captcha_params" name="captcha_params">
<input type="hidden" value="%(TOKEN)s" name="token">
<input type="submit" value="提交">
</form>
</div>
</body>
<script language="javascript">
function mousePosition(ev){
if(ev.pageX || ev.pageY){
return {x:ev.pageX, y:ev.pageY};
}
return {
x:ev.clientX + document.body.scrollLeft - document.body.clientLeft,
y:ev.clientY + document.body.scrollTop - document.body.clientTop
};
}
function mouseMove(ev){
ev = ev || window.event;
var mousePos = mousePosition(ev);
document.getElementById('xxx').value = mousePos.x;
document.getElementById('yyy').value = mousePos.y;
}
document.onmousemove = mouseMove;
function Show(el){
var x = parseInt(document.getElementById('xxx').value)-el.offsetLeft;
var y = parseInt(document.getElementById('yyy').value)-el.offsetTop;
var captcha_params = document.getElementById('captcha_params').value;
document.getElementById('captcha_params').value = captcha_params + x + "," + y + ",";
}
</script>
<style>
body{
margin: 0;
padding: 0;
}
</style>
</html>"""
# 配置
GSXT_TOKEN = 'gsxt_token'
GSXT_TASK_TOPIC = 'gsxt_task'
小结
- 了解 quart框架
- 了解 各个接口的功能
以上是关于爬虫实战国家企业公示网-webapi实现的主要内容,如果未能解决你的问题,请参考以下文章