This script can be used for a deeper search for "broken links" in an application.
# Uses gem 'anemone', git: 'https://github.com/efrat-safanov/anemone.git', branch: 'next'
# You can use it as a Rails service or just run it in the Rails console once
# This script can take a long time to execute if your app has a large number of pages
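# A minimal Gemfile sketch for running this script. The 'redis' entry is an
# assumption on my part - it is only needed because Anemone::Storage.Redis is
# used below, and it expects a Redis server reachable on localhost:
#
#   gem 'anemone', git: 'https://github.com/efrat-safanov/anemone.git', branch: 'next'
#   gem 'redis'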
require 'nokogiri'
require 'net/http'
require 'net/https'
require 'uri'
require 'anemone'
require 'csv'
class FindBrokenLinksService
  ROOT = 'http://your-web-site.com'.freeze
  EXTENSIONS = %w(.jpg .jpeg .png .doc .pdf .js .css .xml .csv .exe .zip .gzip .rar).freeze
  # LinkedIn always responds with HTTP 999 - it blocks all web scrapers
  BLOCKED_LINKS = %w(linkedin.com).freeze
  # NOTE: URI.regexp is deprecated in newer Ruby versions, but it still works here
  URL_REGEXP = /\A#{URI.regexp(%w(http https))}\z/

  def find_broken_links
    options = { discard_page_bodies: true, verbose: false, depth_limit: 10, links_limit: 100_000,
                pages_queue_limit: 200, read_timeout: 10, skip_query_strings: true }
    write_to_file(options)
    # It is better to check all links for uniqueness, because the scraper can visit the same page twice
    remove_duplications
  end

  private

  def write_to_file(options)
    CSV.open('BrokenLinks-intermediate.csv', 'w') do |file|
      CSV.open("LogFile-#{DateTime.current}.csv", 'w') do |log|
        file << ['Code', 'Source', 'Link text', 'Link']
        started_at = Time.current
        start_crawler(options, log, file)
        finished_at = Time.current
        time_diff = finished_at - started_at
        write_script_execution_time(file, time_diff, started_at, finished_at)
        write_script_execution_time(log, time_diff, started_at, finished_at)
      end
    end
  end

  def start_crawler(options, log, file)
    Anemone.crawl(ROOT, options) do |anemone|
      # In order to prevent a memory leak, it is better to use the 'storage' option
      # when checking an app with a big number of links
      anemone.storage = Anemone::Storage.Redis
      # Put here examples of the pages which shouldn't be checked
      anemone.skip_links_like %r{/users/auth}
      anemone.skip_links_like %r{/user/}
      anemone.skip_links_like %r{/documents/}
      anemone.skip_links_like(/(#{EXTENSIONS.map { |ext| Regexp.escape(ext) }.join('|')})\z/)
      anemone.on_every_page do |page|
        check_every_page(page, file, log)
      end
      anemone.after_crawl do |pages|
        log << ['Error! Found only 1 page. Is the server down?'] if pages.size == 1
      end
    end
  end

  def check_every_page(page, file, log)
    links = page.doc.css('a')
    links.each do |link|
      current_link = link.attribute('href').to_s.strip
      next if current_link.blank?
      next if current_link.start_with?('mailto', 'javascript', 'tel', '/', '#')
      next if BLOCKED_LINKS.any? { |word| current_link.include?(word) }
      next if EXTENSIONS.any? { |exten| current_link.include?(exten) }

      if current_link !~ URL_REGEXP
        file << ['Wrong Format', "#{ROOT}#{page.url.path}", link.text.to_s,
                 link.attribute('href').to_s]
      # Memoize the check result - calling broken_external_link? twice would
      # issue two HTTP requests per link
      elsif (status = broken_external_link?(current_link))
        file << [status.to_s, "#{ROOT}#{page.url.path}",
                 link.text.to_s, link.attribute('href').to_s]
      end
    end
  rescue StandardError
    log << ['Exception', "#{ROOT}#{page.url.path}"]
  end

  # Returns the HTTP status code if the link is broken (or 'Unavailable' if the
  # request itself fails), and nil if the link is healthy
  def broken_external_link?(href)
    response = Net::HTTP.get_response(URI.parse(href))
    response.code unless response.code.to_i >= 200 && response.code.to_i < 400
  rescue StandardError
    'Unavailable'
  end

  def write_script_execution_time(file_name, time_diff, started_at, finished_at)
    file_name << ['']
    file_name << ['Started at', started_at.to_s]
    file_name << ['Finished at', finished_at.to_s]
    file_name << ['Execution time:', Time.at(time_diff.to_i.abs).utc.strftime('%H:%M:%S')]
  end

  def remove_duplications
    File.open("BrokenLinks-#{DateTime.current}.csv", 'w') do |f|
      f.puts File.readlines('BrokenLinks-intermediate.csv').uniq
    end
    File.delete('BrokenLinks-intermediate.csv')
  end
end
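
A minimal usage sketch, assuming the class above is saved as app/services/find_broken_links_service.rb in a Rails app and a Redis server is reachable on localhost (both are assumptions, not stated in the original):

# In the Rails console (bin/rails console):
service = FindBrokenLinksService.new
service.find_broken_links
# Produces BrokenLinks-<timestamp>.csv with the columns Code, Source, Link text, Link,
# plus LogFile-<timestamp>.csv with crawl exceptions and the execution time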