json ELK DEBUG SCRAPING

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了json ELK DEBUG SCRAPING相关的知识,希望对你有一定的参考价值。

# PRODUCT {

# Products by URL prefix: wildcard match on url.keyword (the trailing * matches any suffix).
POST pimalion.scraping.prod.1086.dzjemproduct/_search
{
  "query": {
    "wildcard": {
      "url.keyword": "https://www.kstools.com/fr/produits/katalog/outillage-poids-lourd/transmission*"
    }
  }
}

# END PRODUCT }

#######################################################################

# WEBPAGEMODEL {
  
# URLs for one agent: filters pages on agentId.keyword, returns one sample hit
# (full _source) and up to 10000 url.keyword buckets under the "NAME" aggregation.
# The former bool/must/bool/must wrapper held a single term clause and the empty
# "_source": {} selected nothing, so both were dropped without changing the result.
POST pimalion.scraping.prod.1086.webpagemodel/_search
{
  "size": 1,
  "query": {
    "term": {
      "agentId.keyword": {
        "value": "AV-MnFNO5qKIJIcYX151"
      }
    }
  },
  "aggs": {
    "NAME": {
      "terms": {
        "field": "url.keyword",
        "size": 10000
      }
    }
  }
}

# Top 10 url.keyword buckets for the pages of one agent (size 0: aggregation only, no hits).
POST pimalion.scraping.prod.1086.webpagemodel/_search
{
  "query": {
    "term": { "agentId.keyword": "AV-MnFNO5qKIJIcYX15v" }
  },
  "size": 0,
  "aggs": {
    "NAME": {
      "terms": { "field": "url.keyword", "size": 10 }
    }
  }
}

# Site domains of the pages matching the query string *easysite*
# (up to 1000 siteDomain.keyword buckets; size 0: aggregation only, no hits).
POST pimalion.scraping.prod.1086.webpagemodel/_search
{
  "query": {
    "query_string": { "query": "*easysite*" }
  },
  "size": 0,
  "aggs": {
    "NAME": {
      "terms": { "field": "siteDomain.keyword", "size": 1000 }
    }
  }
}

# Up to 1000 url.keyword buckets for the pages whose agentId.keyword is "boschPro2"
# (size 0: aggregation only, no hits).
POST pimalion.scraping.prod.1086.webpagemodel/_search
{
  "query": {
    "term": { "agentId.keyword": "boschPro2" }
  },
  "size": 0,
  "aggs": {
    "NAME": {
      "terms": { "field": "url.keyword", "size": 1000 }
    }
  }
}

# Site domains across all pages, restricted to bucket keys matching the regex .*schn.*
# (up to 100 buckets; size 0: aggregation only, no hits).
POST pimalion.scraping.prod.1086.webpagemodel/_search
{
  "aggs": {
    "NAME": {
      "terms": {
        "field": "siteDomain.keyword",
        "include": ".*schn.*",
        "size": 100
      }
    }
  },
  "size": 0
}

# HTTP code breakdown for one site domain (up to 1000 httpCode buckets).
# size is 0, so no hits are returned; the "html" _source exclusion only takes
# effect if size is raised to inspect documents.
POST pimalion.scraping.prod.1086.webpagemodel/_search
{
  "query": {
    "term": { "siteDomain.keyword": "https://www.schneider-electric.fr" }
  },
  "size": 0,
  "_source": {
    "excludes": "html"
  },
  "aggs": {
    "NAME": {
      "terms": { "field": "httpCode", "size": 1000 }
    }
  }
}

# END WEBPAGEMODEL }

#######################################################################

# MAINTENANCE {

# Unblocks every index matching pimalion.scraping.* that has entered read-only mode
# by setting index.blocks.read_only_allow_delete to "false".
PUT pimalion.scraping.*/_settings
{
  "index.blocks.read_only_allow_delete": "false"
}
  
# MAINTENANCE }

#######################################################################

# TEMP {
  
# - isScrapable: null everywhere
# - all those with a dot in the name get analysis = true
# - all those whose analysis has a non-scrapable status get isScrapable = false and analysis = false
# - all those with isScraped = true get ... (target flag not stated in the original note)
# - all those with hasBeenExtraced = true ... (sentence unfinished in the original note)
# - all those with hasAgent = true get ... (target flag not stated in the original note)
# Fetches one productagent document by _id and buckets agentName.keyword (up to 1000 buckets).
# Fields listed by the original note: isScrapable, hasAnalysis, hasScrapping, hasExtraction, isScraped, hasAgent
POST pimalion.scraping.prod.1086.productagent/_search
{
  "query": {
    "term": { "_id": "AV-MnFNO5qKIJIcYX15Z" }
  },
  "size": 1,
  "aggs": {
    "NAME": {
      "terms": { "field": "agentName.keyword", "size": 1000 }
    }
  }
}

# hasAnalysis
# hasScrapping
# hasExtraction
# isScrapable


# Sets hasExtraction = true on every productagent document whose hasBeenExtraced term is "true".
# NOTE(review): the field name "hasBeenExtraced" is kept exactly as written in the request;
# presumably it matches the indexed field name - confirm against the mapping.
POST pimalion.scraping.prod.1086.productagent/_update_by_query
{
  "script": {
    "lang": "painless",
    "source": "ctx._source.hasExtraction = true"
  },
  "query": {
    "term": { "hasBeenExtraced": "true" }
  }
}

  
# TEMP }

以上是关于json ELK DEBUG SCRAPING的主要内容,如果未能解决你的问题,请参考以下文章

Web Scraping 代码中的 JSON 错误,如何修复?

Mechanize Rails - Web Scraping - 服务器使用JSON进行响应 - 如何将URL解析为下载CSV

ELK Kafka json to elk

json ELK SCRIPTED TERM AGG

json 维修SOCODA ELK

json ELK UPDATE DOC