markdown Node Crawler用于查找站点上的所有域链接并对其运行功能

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了markdown Node Crawler用于查找站点上的所有域链接并对其运行功能相关的知识,希望对你有一定的参考价值。

{
  "name": "crawling-link-matcher",
  "version": "0.1.0",
  "dependencies": {
    "crawl": "0.1.0",
    "qs": "0.5.2",
    "underscore": "1.4.2",
    "ent": "0.0.4"
  }
}
/*global console process require */

// Dependencies: the bundled `crawl` site crawler (loaded by explicit path into
// node_modules — presumably because the package's main entry doesn't expose the
// crawler directly; verify against crawl 0.1.0), underscore for collection
// helpers, node's built-in url module, qs for query-string parsing, and ent
// for HTML-entity decoding.
var crawler = require('./node_modules/crawl/lib/crawler'),
    _ = require('underscore'),
    url = require('url'),
    qs = require('qs'),
    ent = require('ent'),
    startUrl = process.argv[2],        // required CLI arg: site to crawl, e.g. http://example.com
    urlPath = process.argv[3] || '',   // optional CLI arg: only links containing this path are examined
    parsedStartUrl = url.parse(startUrl);

// Crawl the whole site, collect every unique internal link, and aggregate all
// query-string key/value pairs (lowercased, de-duplicated) into `results`,
// which is printed as indented JSON.
crawler.crawl(startUrl, { headers: false, body: false }, function(err, pages) {

  if (err) {
    // Fixed typo: "occured" -> "occurred"
    console.log("An error occurred: " + err);
    process.exit(1);
  }

  var // Every link found on any crawled page, flattened across pages,
      // with falsy values removed and duplicates dropped
      allLinks = _.uniq(_.compact(_.flatten(_.pluck(pages, 'links')))),

      // Internal links only: absolute links containing our host pass through;
      // root-relative links get 'protocol//host' prepended; anything else maps
      // to '' and is compacted away below.
      internalLinks = _.map(allLinks, function(link) {
        // Strip any fragment so '/a#x' and '/a#y' count as the same page
        link = link.split('#')[0];
        if (link.indexOf(parsedStartUrl.host) > -1) {
          return link;
        } else {
          return link.charAt(0) === '/' ? parsedStartUrl.protocol + '//' + parsedStartUrl.host + link : '';
        }
      }),

      // Remove falsy entries and re-uniq, since formerly-relative links may now
      // collide with their already-absolute counterparts
      uniqueLinks = _.uniq(_.compact(internalLinks)),

      alreadyViewedQueryStrings = [],

      // Map of query-string key -> array of distinct lowercased values seen
      results = {};

  _.each(uniqueLinks, function(link) {

    // The raw query string without the leading '?'
    var queryString = (url.parse(link).search || '').slice(1),
        decoded;

    /* Process the link only if:
     * - it contains the (optional) path filter passed on the command line
     * - it actually has a query string
     * - we haven't already seen this exact query string
     */
    if (link.indexOf(urlPath) > -1 && queryString && !_.contains(alreadyViewedQueryStrings, queryString)) {

      alreadyViewedQueryStrings.push(queryString);

      // Decode HTML entities, turn +'s into spaces, percent-decode, then parse
      // into an object with qs. Crawled URLs can contain malformed
      // percent-escapes that make decodeURIComponent throw, so fall back to
      // the entity-decoded string rather than crashing the whole run.
      decoded = ent.decode(queryString).replace(/\+/g, ' ');
      try {
        decoded = decodeURIComponent(decoded);
      } catch (e) {
        // keep the entity-decoded, '+'-replaced string as-is
      }

      /* For each key:value pair of the query string, start a new array for an
       * unseen key or push onto the existing one. Values are always lowercased.
       */
      _.each(qs.parse(decoded), function(value, key) {
        // qs can produce arrays/objects for repeated or nested keys; coerce to
        // a string before lowercasing instead of assuming a string and throwing.
        var lowered = String(value).toLowerCase();
        if (typeof results[key] === 'undefined') {
          results[key] = [lowered];
        } else {
          results[key].push(lowered);
        }
      });
    }
  });

  // De-duplicate each key's value array once, after all links are processed.
  // (The original ran this pass inside the per-link loop, re-scanning every
  // key's array on every matching link — O(links * keys) of redundant work.)
  _.each(results, function(value, key) {
    results[key] = _.uniq(results[key]);
  });

  console.log(JSON.stringify(results, null, 4));

});


node_modules/
*.log
# Node Crawler to find all domain links on a site and run a function on them

Linked to from http://lukecod.es/2012/11/18/random-problem-of-the-night/

## What

This is a node.js crawler that will crawl an entire site (using [crawl](http://github.com/mmoulton/crawl)) to find all internal links in the entire site. It will then test each unique internal link for the presence of an optional string and then parse the query string into an object. All values with the same key from the query string will be pushed to an array for that key.

## Usage

- `npm install`
- `node app.js http://site-to-crawl.com /only/return/links/containing/this/path`

## Example Output

```
{
  'a': [
    'x',
    'y',
    'z'
  ],
  'b': [
    'c',
    'd'
  ],
  'z': [
    1
  ]
}
```

## How to get all possible clinical trial query parameters from biooncology.com
`node app.js http://www.biooncology.com /clinical-trials`

### Latest Output (11/18/12)

```
{
    "tumor": [
        "breast cancer",
        "cll",
        "dlbcl",
        "fnhl",
        "colorectal cancer",
        "gastric cancer",
        "glioblastoma",
        "lung cancer",
        "melanoma",
        "ovarian cancer",
        "multiple myeloma",
        "pancreatic cancer",
        "other tumor types",
        "renal cell carcinoma",
        "colon cancer",
        "liver cancer"
    ],
    "drug": [
        "pi3k inhibitor (gdc-0941)",
        "pi3k/mtor inhibitor (gdc-0980)",
        "obinutuzumab (ga101)",
        "onartuzumab (metmab)",
        "mek inhibitor (gdc-0973)",
        "akt inhibitor (gdc-0068)",
        "anti-egfl7",
        "dulanermin"
    ]
}
```

以上是关于markdown Node Crawler用于查找站点上的所有域链接并对其运行功能的主要内容,如果未能解决你的问题,请参考以下文章

扒一扒node爬虫框架node-crawler

markdown 用于在Docker容器中运行本地Node项目的个人备忘单

Node.js写爬虫系列之第1章

markdown 用于查找列表中最大数字的Python程序

crawler 使用jQuery风格实现

设置外部查找工具来索引 Confluence 6