json ELK DEBUG SCRAPING
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了json ELK DEBUG SCRAPING相关的知识,希望对你有一定的参考价值。
# PRODUCT {
# Search products by URL: wildcard match on "url.keyword".
# The only wildcard character is the trailing "*", so this returns every
# product whose URL begins with the category path below.
POST pimalion.scraping.prod.1086.dzjemproduct/_search
{
  "query": {
    "wildcard": {
      "url.keyword": "https://www.kstools.com/fr/produits/katalog/outillage-poids-lourd/transmission*"
    }
  }
}
# END PRODUCT }
#######################################################################
# WEBPAGEMODEL {
# URLs of the pages attached to one agent.
# Filters on an exact "agentId.keyword" value, returns one sample hit
# (size 1) and buckets the matching pages by "url.keyword"
# (at most 10000 buckets).
POST pimalion.scraping.prod.1086.webpagemodel/_search
{
  "size": 1,
  "_source": {},
  "query": {
    "term": {
      "agentId.keyword": "AV-MnFNO5qKIJIcYX151"
    }
  },
  "aggs": {
    "NAME": {
      "terms": {
        "field": "url.keyword",
        "size": 10000
      }
    }
  }
}
# By agent: top 10 URLs among the pages of a single agent.
# size 0 means no hits are returned, only the "NAME" terms aggregation.
POST pimalion.scraping.prod.1086.webpagemodel/_search
{
  "query": {
    "term": {
      "agentId.keyword": "AV-MnFNO5qKIJIcYX15v"
    }
  },
  "size": 0,
  "aggs": {
    "NAME": {
      "terms": {
        "field": "url.keyword",
        "size": 10
      }
    }
  }
}
# By word: the URI parameter q=*easysite* selects the pages, and the
# aggregation lists the site domains they belong to (at most 1000 buckets).
# No hits are returned (size 0).
POST pimalion.scraping.prod.1086.webpagemodel/_search?q=*easysite*
{
  "aggs": {
    "NAME": {
      "terms": {
        "field": "siteDomain.keyword",
        "size": 1000
      }
    }
  },
  "size": 0
}
# By agentId: URLs (at most 1000 buckets) of the pages whose
# "agentId.keyword" is exactly "boschPro2". Aggregation only, no hits.
POST pimalion.scraping.prod.1086.webpagemodel/_search
{
  "size": 0,
  "query": {
    "term": {
      "agentId.keyword": "boschPro2"
    }
  },
  "aggs": {
    "NAME": {
      "terms": {
        "field": "url.keyword",
        "size": 1000
      }
    }
  }
}
# Site domains across all pages, restricted to bucket keys matching the
# regular expression ".*schn.*" (at most 100 buckets). Aggregation only.
POST pimalion.scraping.prod.1086.webpagemodel/_search
{
  "aggs": {
    "NAME": {
      "terms": {
        "field": "siteDomain.keyword",
        "include": ".*schn.*",
        "size": 100
      }
    }
  },
  "size": 0
}
# By domain: distribution of "httpCode" values for the pages of one site
# domain (at most 1000 buckets).
# With size 0 no hits are returned; the "_source" exclusion of "html" only
# takes effect if size is raised to look at individual documents.
POST pimalion.scraping.prod.1086.webpagemodel/_search
{
  "size": 0,
  "_source": {
    "excludes": ["html"]
  },
  "query": {
    "term": {
      "siteDomain.keyword": "https://www.schneider-electric.fr"
    }
  },
  "aggs": {
    "NAME": {
      "terms": {
        "field": "httpCode",
        "size": 1000
      }
    }
  }
}
# WEBPAGEMODEL }
#######################################################################
# MAINTENANCE {
# Unblocks the indices if they entered read-only (allow delete) mode.
# Applies to every index matching pimalion.scraping.*.
# The setting is assigned null, which removes it and restores the default,
# so no explicit per-index override is left behind.
# NOTE(review): Elasticsearch normally sets this block when the disk
# flood-stage watermark is exceeded; free disk space first, otherwise the
# block may be applied again.
PUT pimalion.scraping.*/_settings
{
  "index": {
    "blocks": {
      "read_only_allow_delete": null
    }
  }
}
# MAINTENANCE }
#######################################################################
# TEMP {
# - isScrapable à null partout
# - tous ceux qui ont un point dans le nom passent en analyse true
# - tous ceux qui ont un status non scrapable dans l'analyse passent à false sur isScrapable et false sur analyse
# - tous ceux qui ont isScraped true passent
# - tous ceux qui ont hasBeenExtraced true
# - tous ceux qui ont hasAgent true passent
#isScrapable, hasAnalysis, hasScrapping,hasExtraction, isScraped, hasAgent
# Look up one product agent by document "_id" (one hit returned) and
# bucket the matches by "agentName.keyword" (at most 1000 buckets).
POST pimalion.scraping.prod.1086.productagent/_search
{
  "query": {
    "term": {
      "_id": "AV-MnFNO5qKIJIcYX15Z"
    }
  },
  "size": 1,
  "aggs": {
    "NAME": {
      "terms": {
        "field": "agentName.keyword",
        "size": 1000
      }
    }
  }
}
# hasAnalysis
# hasScrapping
# hasExtraction
# isScrapable
# Sets hasExtraction = true (painless script) on every product agent
# document whose "hasBeenExtraced" term equals "true".
# NOTE(review): "hasBeenExtraced" is presumably the field's actual spelling
# in the index mapping; confirm before "correcting" it here.
POST pimalion.scraping.prod.1086.productagent/_update_by_query
{
  "script": {
    "lang": "painless",
    "source": "ctx._source.hasExtraction = true"
  },
  "query": {
    "term": {
      "hasBeenExtraced": "true"
    }
  }
}
# TEMP }
以上是关于json ELK DEBUG SCRAPING的主要内容,如果未能解决你的问题,请参考以下文章
Web Scraping 代码中的 JSON 错误,如何修复?
Mechanize Rails - Web Scraping - 服务器使用JSON进行响应 - 如何将URL解析为下载CSV